diff --git a/R-package/R/mlp.R b/R-package/R/mlp.R
deleted file mode 100644
index 3cfa06e967ce..000000000000
--- a/R-package/R/mlp.R
+++ /dev/null
@@ -1,82 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-#' Convenience interface for multiple layer perceptron
-#' 
-#' @param data the input matrix. Only mx.io.DataIter and R array/matrix types supported.
-#' @param label the training label. Only R array type supported.
-#' @param hidden_node a vector containing number of hidden nodes on each hidden layer as well as the output layer.
-#' @param out_node the number of nodes on the output layer.
-#' @param dropout a number in [0,1) containing the dropout ratio from the last hidden layer to the output layer.
-#' @param activation either a single string or a vector containing the names of the activation functions.
-#' @param out_activation a single string containing the name of the output activation function.
-#' @param ctx whether train on cpu (default) or gpu.
-#' @param eval.metric the evaluation metric/
-#' @param ... other parameters passing to \code{mx.model.FeedForward.create}/
-#' 
-#' @examples
-#' 
-#' require(mlbench)
-#' data(Sonar, package="mlbench")
-#' Sonar[,61] = as.numeric(Sonar[,61])-1
-#' train.ind = c(1:50, 100:150)
-#' train.x = data.matrix(Sonar[train.ind, 1:60])
-#' train.y = Sonar[train.ind, 61]
-#' test.x = data.matrix(Sonar[-train.ind, 1:60])
-#' test.y = Sonar[-train.ind, 61]
-#' model = mx.mlp(train.x, train.y, hidden_node = 10, out_node = 2, out_activation = "softmax", 
-#'                learning.rate = 0.1)
-#' preds = predict(model, test.x)
-#' 
-#' @export
-mx.mlp <- function(data, label, hidden_node = 1, out_node, dropout = NULL, 
-                   activation = "tanh", out_activation = "softmax",
-                   ctx = mx.ctx.default(), ...) {
-  
-  m <- length(hidden_node)
-  if (!is.null(dropout)) {
-    if (length(dropout) != 1) {
-      stop("only accept dropout ratio of length 1.")
-    }
-    dropout = max(0,min(dropout, 1-1e-7))
-  }
-  
-  # symbol construction
-  act <- mx.symbol.Variable("data")
-  if (length(activation) == 1) {
-    activation <- rep(activation, m)
-  } else {
-    if (length(activation) != m) {
-      stop(paste("Length of activation should be",m))
-    }
-  }
-  for (i in seq_len(m)) {
-    fc <- mx.symbol.FullyConnected(act, num_hidden=hidden_node[i])
-    act <- mx.symbol.Activation(fc, act_type=activation[i])
-    if (i == m && !is.null(dropout)) {
-      act <- mx.symbol.Dropout(act, p = dropout)
-    }
-  }
-  fc <- mx.symbol.FullyConnected(act, num_hidden=out_node)
-  out <- switch(out_activation,
-                "rmse" = mx.symbol.LinearRegressionOutput(fc),
-                "softmax" = mx.symbol.SoftmaxOutput(fc),
-                "logistic" = mx.symbol.LogisticRegressionOutput(fc),
-                stop("Not supported yet."))
-  model <- mx.model.FeedForward.create(out, X=data, y=label, ctx = ctx, ...)
-  return(model)
-}
diff --git a/R-package/R/rnn.graph.R b/R-package/R/rnn.graph.R
deleted file mode 100644
index 1225fa511b51..000000000000
--- a/R-package/R/rnn.graph.R
+++ /dev/null
@@ -1,372 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-#' Generate a RNN symbolic model - requires CUDA
-#' 
-#' @param config Either seq-to-one or one-to-one
-#' @param cell_type Type of RNN cell: either gru or lstm
-#' @param num_rnn_layer int, number of stacked layers
-#' @param num_hidden int, size of the state in each RNN layer
-#' @param num_embed  int, default = NULL - no embedding. Dimension of the embedding vectors
-#' @param num_decode int, number of output variables in the decoding layer
-#' @param input_size int, number of levels in the data - only used for embedding
-#' @param dropout
-#' 
-#' @export
-rnn.graph <- function (num_rnn_layer, input_size = NULL, num_embed = NULL, 
-                       num_hidden, num_decode, dropout = 0, ignore_label = -1, bidirectional = F, 
-                       loss_output = NULL, config, cell_type, masking = F, output_last_state = F,
-                       rnn.state = NULL, rnn.state.cell = NULL, prefix = "") {
-  
-  data <- mx.symbol.Variable("data")
-  label <- mx.symbol.Variable("label")
-  seq.mask <- mx.symbol.Variable("seq.mask")
-  if (!is.null(num_embed)) 
-    embed.weight <- mx.symbol.Variable("embed.weight")
-  rnn.params.weight <- mx.symbol.Variable("rnn.params.weight")
-  
-  if (is.null(rnn.state)) rnn.state <- mx.symbol.Variable("rnn.state")
-  if (cell_type == "lstm" & is.null(rnn.state.cell)) {
-    rnn.state.cell <- mx.symbol.Variable("rnn.state.cell")
-  }
-  
-  cls.weight <- mx.symbol.Variable("cls.weight")
-  cls.bias <- mx.symbol.Variable("cls.bias")
-  if (!is.null(num_embed)) {
-    data <- mx.symbol.Embedding(data = data, input_dim = input_size, 
-                                weight = embed.weight, output_dim = num_embed, name = "embed")
-  }
-  
-  data = mx.symbol.swapaxes(data = data, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_pre"))
-  
-  if (cell_type == "lstm") {
-    rnn <- mx.symbol.RNN(data = data, state = rnn.state, 
-                         state_cell = rnn.state.cell, parameters = rnn.params.weight, 
-                         state.size = num_hidden, num.layers = num_rnn_layer, 
-                         bidirectional = bidirectional, mode = cell_type, state.outputs = output_last_state, 
-                         p = dropout, name = paste0(prefix, "RNN"))
-  } else {
-    rnn <- mx.symbol.RNN(data = data, state = rnn.state, 
-                         parameters = rnn.params.weight, state.size = num_hidden, 
-                         num.layers = num_rnn_layer, bidirectional = bidirectional, mode = cell_type, 
-                         state.outputs = output_last_state, p = dropout, 
-                         name = paste0(prefix, "RNN"))
-  }
-  
-  if (config == "seq-to-one") {
-    if (masking) mask <- mx.symbol.SequenceLast(data = rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, name = "mask") else
-      mask <- mx.symbol.SequenceLast(data = rnn[[1]], use.sequence.length = F, name = "mask")
-    
-    if (!is.null(loss_output)) {
-      decode <- mx.symbol.FullyConnected(data = mask, weight = cls.weight, bias = cls.bias, num_hidden = num_decode, name = "decode")
-      out <- switch(loss_output, softmax = mx.symbol.SoftmaxOutput(data = decode, label = label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"), 
-                    linear = mx.symbol.LinearRegressionOutput(data = decode, label = label, name = "loss"), 
-                    logistic = mx.symbol.LogisticRegressionOutput(data = decode, label = label, name = "loss"), 
-                    MAE = mx.symbol.MAERegressionOutput(data = decode, label = label, name = "loss"))
-    }
-    else out <- mask
-  }
-  
-  else if (config == "one-to-one") {
-    
-    if (masking) mask <- mx.symbol.SequenceMask(data = rnn[[1]], use.sequence.length = T, sequence_length = seq.mask, value = 0, name = "mask") else
-      mask <- mx.symbol.identity(data = rnn[[1]], name = "mask")
-    
-    mask = mx.symbol.swapaxes(data = mask, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_post"))
-    
-    if (!is.null(loss_output)) {
-      
-      mask <- mx.symbol.reshape(data = mask, shape = c(0, -1), reverse = TRUE)
-      label <- mx.symbol.reshape(data = label, shape = c(-1))
-      
-      decode <- mx.symbol.FullyConnected(data = mask, weight = cls.weight, bias = cls.bias, num_hidden = num_decode, 
-                                         flatten = TRUE, name = paste0(prefix, "decode"))
-      
-      out <- switch(loss_output, softmax = mx.symbol.SoftmaxOutput(data = decode, label = label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = "loss"), 
-                    linear = mx.symbol.LinearRegressionOutput(data = decode, label = label, name = "loss"), 
-                    logistic = mx.symbol.LogisticRegressionOutput(data = decode, label = label, name = "loss"), 
-                    MAE = mx.symbol.MAERegressionOutput(data = decode, label = label, name = "loss"))
-    } else out <- mask
-  }
-  return(out)
-}
-
-# LSTM cell symbol
-lstm.cell <- function(num_hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0, prefix = "") {
-  
-  if (dropout > 0 && layeridx > 1) 
-    indata <- mx.symbol.Dropout(data = indata, p = dropout)
-  
-  i2h <- mx.symbol.FullyConnected(data = indata, weight = param$i2h.weight, bias = param$i2h.bias, 
-                                  num_hidden = num_hidden * 4, name = paste0(prefix, "t", seqidx, ".l", layeridx, ".i2h"))
-  
-  if (!is.null(prev.state)) {
-    h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$h2h.weight, 
-                                    bias = param$h2h.bias, num_hidden = num_hidden * 4, 
-                                    name = paste0(prefix, "t", seqidx, ".l", layeridx, ".h2h"))
-    gates <- i2h + h2h
-  } else {
-    gates <- i2h
-  }
-  
-  split.gates <- mx.symbol.split(gates, num.outputs = 4, axis = 1, squeeze.axis = F, 
-                                 name = paste0(prefix, "t", seqidx, ".l", layeridx, ".slice"))
-  
-  in.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid")
-  in.transform <- mx.symbol.Activation(split.gates[[2]], act.type = "tanh")
-  forget.gate <- mx.symbol.Activation(split.gates[[3]], act.type = "sigmoid")
-  out.gate <- mx.symbol.Activation(split.gates[[4]], act.type = "sigmoid")
-  
-  if (is.null(prev.state)) {
-    next.c <- in.gate * in.transform
-  } else {
-    next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform)
-  }
-  
-  next.h <- out.gate * mx.symbol.Activation(next.c, act.type = "tanh")
-  
-  return(list(h = next.h, c = next.c))
-}
-
-
-# GRU cell symbol
-gru.cell <- function(num_hidden, indata, prev.state, param, seqidx, layeridx, dropout = 0, prefix)
-{
-  if (dropout > 0 && layeridx > 1) 
-    indata <- mx.symbol.Dropout(data = indata, p = dropout)
-  
-  i2h <- mx.symbol.FullyConnected(data = indata, weight = param$gates.i2h.weight, 
-                                  bias = param$gates.i2h.bias, num_hidden = num_hidden * 2, 
-                                  name = paste0(prefix, "t", seqidx, ".l", layeridx, ".gates.i2h"))
-  
-  if (!is.null(prev.state)) {
-    h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$gates.h2h.weight, 
-                                    bias = param$gates.h2h.bias, num_hidden = num_hidden * 2, 
-                                    name = paste0(prefix, "t", seqidx, ".l", layeridx, ".gates.h2h"))
-    gates <- i2h + h2h
-  } else {
-    gates <- i2h
-  }
-
-  split.gates <- mx.symbol.split(gates, num.outputs = 2, axis = 1, squeeze.axis = F, 
-                                 name = paste0(prefix, "t", seqidx, ".l", layeridx, ".split"))
-  
-  update.gate <- mx.symbol.Activation(split.gates[[1]], act.type = "sigmoid")
-  reset.gate <- mx.symbol.Activation(split.gates[[2]], act.type = "sigmoid")
-  
-  htrans.i2h <- mx.symbol.FullyConnected(data = indata, weight = param$trans.i2h.weight, 
-                                         bias = param$trans.i2h.bias, num_hidden = num_hidden, 
-                                         name = paste0(prefix, "t", seqidx, ".l", layeridx, ".trans.i2h"))
-  
-  if (is.null(prev.state)) {
-    h.after.reset <- reset.gate * 0
-  } else {
-    h.after.reset <- prev.state$h * reset.gate
-  }
-  
-  htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight, 
-                                         bias = param$trans.h2h.bias, num_hidden = num_hidden, 
-                                         name = paste0(prefix, "t", seqidx, ".l", layeridx, ".trans.h2h"))
-  
-  h.trans <- htrans.i2h + htrans.h2h
-  h.trans.active <- mx.symbol.Activation(h.trans, act.type = "tanh")
-  
-  if (is.null(prev.state)) {
-    next.h <- update.gate * h.trans.active
-  } else {
-    next.h <- prev.state$h + update.gate * (h.trans.active - prev.state$h)
-  }
-  
-  return(list(h = next.h))
-}
-
-
-#' Unroll representation of RNN running on non CUDA device
-#' 
-#' @param config Either seq-to-one or one-to-one
-#' @param cell_type Type of RNN cell: either gru or lstm
-#' @param num_rnn_layer int, number of stacked layers
-#' @param seq_len int, number of time steps to unroll
-#' @param num_hidden int, size of the state in each RNN layer
-#' @param num_embed  int, default = NULL - no embedding. Dimension of the embedding vectors
-#' @param num_decode int, number of output variables in the decoding layer
-#' @param input_size int, number of levels in the data - only used for embedding
-#' @param dropout 
-#' 
-#' @export
-rnn.graph.unroll <- function(num_rnn_layer, 
-                             seq_len, 
-                             input_size = NULL,
-                             num_embed = NULL, 
-                             num_hidden,
-                             num_decode,
-                             dropout = 0,
-                             ignore_label = -1,
-                             loss_output = NULL, 
-                             init.state = NULL,
-                             config,
-                             cell_type = "lstm", 
-                             masking = F, 
-                             output_last_state = F,
-                             prefix = "",
-                             data_name = "data",
-                             label_name = "label") {
-  
-  if (!is.null(num_embed)) embed.weight <- mx.symbol.Variable(paste0(prefix, "embed.weight"))
-  
-  # Initial state
-  if (is.null(init.state) & output_last_state) {
-    init.state <- lapply(1:num_rnn_layer, function(i) {
-      if (cell_type=="lstm") {
-        state <- list(h = mx.symbol.Variable(paste0("init_", prefix, i, "_h")),
-                      c = mx.symbol.Variable(paste0("init_", prefix, i, "_c")))
-      } else if (cell_type=="gru") {
-        state <- list(h = mx.symbol.Variable(paste0("init_", prefix, i, "_h")))
-      }
-      return (state)
-    })
-  }
-  
-  cls.weight <- mx.symbol.Variable(paste0(prefix, "cls.weight"))
-  cls.bias <- mx.symbol.Variable(paste0(prefix, "cls.bias"))
-  
-  param.cells <- lapply(1:num_rnn_layer, function(i) {
-    
-    if (cell_type=="lstm") {
-      cell <- list(i2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".i2h.weight")),
-                   i2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".i2h.bias")),
-                   h2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".h2h.weight")),
-                   h2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".h2h.bias")))
-    } else if (cell_type=="gru") {
-      cell <- list(gates.i2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.i2h.weight")),
-                   gates.i2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.i2h.bias")),
-                   gates.h2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.h2h.weight")),
-                   gates.h2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".gates.h2h.bias")),
-                   trans.i2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.i2h.weight")),
-                   trans.i2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.i2h.bias")),
-                   trans.h2h.weight = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.h2h.weight")),
-                   trans.h2h.bias = mx.symbol.Variable(paste0(prefix, "l", i, ".trans.h2h.bias")))
-    }
-    return (cell)
-  })
-  
-  # embeding layer
-  data <- mx.symbol.Variable(data_name)
-  label <- mx.symbol.Variable(label_name)
-  seq.mask <- mx.symbol.Variable(paste0(prefix, "seq.mask"))
-  
-  data = mx.symbol.swapaxes(data = data, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_pre"))
-  
-  if (!is.null(num_embed)) {
-    data <- mx.symbol.Embedding(data = data, input_dim = input_size,
-                                weight=embed.weight, output_dim = num_embed, name = paste0(prefix, "embed"))
-  }
-  
-  data <- mx.symbol.split(data = data, axis = 0, num.outputs = seq_len, squeeze_axis = T)
-  
-  last.hidden <- list()
-  last.states <- list()
-  
-  for (seqidx in 1:seq_len) {
-    hidden <- data[[seqidx]]
-    
-    for (i in 1:num_rnn_layer) {
-      
-      if (seqidx==1) prev.state <- init.state[[i]] else 
-        prev.state <- last.states[[i]]
-      
-      if (cell_type=="lstm") {
-        cell.symbol <- lstm.cell
-      } else if (cell_type=="gru"){
-        cell.symbol <- gru.cell
-      }
-      
-      next.state <- cell.symbol(num_hidden = num_hidden, 
-                                indata = hidden,
-                                prev.state = prev.state,
-                                param = param.cells[[i]],
-                                seqidx = seqidx, 
-                                layeridx = i,
-                                dropout = dropout,
-                                prefix = prefix)
-      
-      hidden <- next.state$h
-      last.states[[i]] <- next.state
-    }
-    
-    # Aggregate outputs from each timestep
-    last.hidden <- c(last.hidden, hidden)
-  }
-  
-  if (output_last_state) {
-    out.states = mx.symbol.Group(unlist(last.states))
-  }
-  
-  # concat hidden units - concat seq_len blocks of dimension num_hidden x batch.size
-  concat <- mx.symbol.concat(data = last.hidden, num.args = seq_len, dim = 0, name = paste0(prefix, "concat"))
-  concat <- mx.symbol.reshape(data = concat, shape = c(num_hidden, -1, seq_len), name = paste0(prefix, "rnn_reshape"))
-  
-  if (config=="seq-to-one") {
-    
-    if (masking) mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = T, sequence_length = seq.mask, name = paste0(prefix, "mask")) else
-      mask <- mx.symbol.SequenceLast(data=concat, use.sequence.length = F, name = paste0(prefix, "mask"))
-    
-    if (!is.null(loss_output)) {
-      
-      decode <- mx.symbol.FullyConnected(data = mask,
-                                         weight = cls.weight,
-                                         bias = cls.bias,
-                                         num_hidden = num_decode,
-                                         name = paste0(prefix, "decode"))
-      
-      out <- switch(loss_output,
-                    softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, name = paste0(prefix, "loss")),
-                    linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")),
-                    logistic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, paste0(prefix, name = "loss")),
-                    MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, paste0(prefix, name = "loss"))
-      )
-    } else out <- mask
-    
-  } else if (config=="one-to-one"){
-    
-    if (masking) mask <- mx.symbol.SequenceMask(data = concat, use.sequence.length = T, sequence_length = seq.mask, value = 0, name = paste0(prefix, "mask")) else
-      mask <- mx.symbol.identity(data = concat, name = paste0(prefix, "mask"))
-    
-    mask = mx.symbol.swapaxes(data = mask, dim1 = 0, dim2 = 1, name = paste0(prefix, "swap_post"))
-    
-    if (!is.null(loss_output)) {
-      
-      mask <- mx.symbol.reshape(data = mask, shape = c(0, -1), reverse = TRUE)
-      label <- mx.symbol.reshape(data = label, shape = c(-1))
-      
-      decode <- mx.symbol.FullyConnected(data = mask, weight = cls.weight, bias = cls.bias, num_hidden = num_decode, 
-                                         flatten = T, name = paste0(prefix, "decode"))
-      
-      out <- switch(loss_output,
-                    softmax = mx.symbol.SoftmaxOutput(data=decode, label=label, use_ignore = !ignore_label == -1, ignore_label = ignore_label, 
-                                                      name = paste0(prefix, "loss")),
-                    linear = mx.symbol.LinearRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")),
-                    logistic = mx.symbol.LogisticRegressionOutput(data=decode, label=label, name = paste0(prefix, "loss")),
-                    MAE = mx.symbol.MAERegressionOutput(data=decode, label=label, name = paste0(prefix, "loss"))
-      )
-    } else out <- mask
-  }
-  
-  if (output_last_state) {
-    return(mx.symbol.Group(c(out, out.states)))
-  } else return(out)
-}
diff --git a/R-package/R/viz.graph.R b/R-package/R/viz.graph.R
index ab876afdfa1e..488ed24818a1 100644
--- a/R-package/R/viz.graph.R
+++ b/R-package/R/viz.graph.R
@@ -57,12 +57,7 @@ graph.viz <- function(symbol, shape=NULL, direction="TD", type="graph", graph.wi
       "Flatten" = ,
       "Reshape" = ,
       "Concat" = "#fdb462",
-      "LinearRegressionOutput"=,
-      "MAERegressionOutput"=,
-      "SVMOutput"=,
-      "LogisticRegressionOutput"=,
       "MakeLoss"=,
-      "SoftmaxOutput" = "#b3de69",
       "#fccde5" # default value
     )
   }
diff --git a/R-package/demo/00Index b/R-package/demo/00Index
index 1629d05f86f4..f467bbdb3880 100644
--- a/R-package/demo/00Index
+++ b/R-package/demo/00Index
@@ -1,7 +1,6 @@
 basic_bench                 Basic benchmark
 basic_executor              Basic executor operations
 basic_kvstore               Basic kvstore operations
-basic_model                 Basic model operations
 basic_ndarray               Basic ndarray operations
 basic_random                Basic random number generators
 basic_symbol                Basic symbol operations
diff --git a/R-package/demo/basic_model.R b/R-package/demo/basic_model.R
deleted file mode 100644
index 0ac028e472c2..000000000000
--- a/R-package/demo/basic_model.R
+++ /dev/null
@@ -1,127 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-list.of.packages <- c("R.utils")
-new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
-if( length(new.packages)) install.packages(new.packages, repos = "https://cloud.r-project.org/")
-
-setwd(tempdir())
-
-download.file("http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz", destfile="train-images-idx3-ubyte.gz")
-
-download.file("http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz", destfile="train-labels-idx1-ubyte.gz")
-
-download.file("http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz", destfile="t10k-images-idx3-ubyte.gz")
-
-download.file("http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz", destfile="t10k-labels-idx1-ubyte.gz")
-
-require(R.utils)
-
-gunzip("train-images-idx3-ubyte.gz")
-
-gunzip("train-labels-idx1-ubyte.gz")
-
-gunzip("t10k-images-idx3-ubyte.gz")
-
-gunzip("t10k-labels-idx1-ubyte.gz")
-
-require(mxnet)
-
-# Network configuration
-batch.size <- 100
-data <- mx.symbol.Variable("data")
-fc1 <- mx.symbol.FullyConnected(data, name = "fc1", num_hidden = 128)
-act1 <- mx.symbol.Activation(fc1, name = "relu1", act_type = "relu")
-fc2 <- mx.symbol.FullyConnected(act1, name = "fc2", num_hidden = 64)
-act2 <- mx.symbol.Activation(fc2, name = "relu2", act_type = "relu")
-fc3 <- mx.symbol.FullyConnected(act2, name = "fc3", num_hidden = 10)
-softmax <- mx.symbol.Softmax(fc3, name = "sm")
-
-dtrain <- mx.io.MNISTIter(
-  image = "train-images-idx3-ubyte",
-  label = "train-labels-idx1-ubyte",
-  data.shape = c(784),
-  batch.size = batch.size,
-  shuffle = TRUE,
-  flat = TRUE,
-  silent = 0,
-  seed = 10)
-
-dtest = mx.io.MNISTIter(
-  image="t10k-images-idx3-ubyte",
-  label="t10k-labels-idx1-ubyte",
-  data.shape=c(784),
-  batch.size=batch.size,
-  shuffle=FALSE,
-  flat=TRUE,
-  silent=0)
-
-mx.set.seed(0)
-devices = lapply(1:2, function(i) {
-  mx.cpu(i)
-})
-
-# create the model
-model <- mx.model.FeedForward.create(softmax, X=dtrain, eval.data=dtest,
-                                     ctx=devices, num.round=1,
-                                     learning.rate=0.1, momentum=0.9,
-                                     initializer=mx.init.uniform(0.07),
-                                     epoch.end.callback=mx.callback.save.checkpoint("chkpt"),
-                                     batch.end.callback=mx.callback.log.train.metric(100))
-
-# do prediction
-pred <- predict(model, dtest)
-label <- mx.io.extract(dtest, "label")
-dataX <- mx.io.extract(dtest, "data")
-# Predict with R's array
-pred2 <- predict(model, X = dataX)
-
-accuracy <- function(label, pred) {
-  ypred = max.col(t(as.array(pred)))
-  return(sum((as.array(label) + 1) == ypred) / length(label))
-}
-
-print(paste0("Finish prediction... accuracy = ", accuracy(label, pred)))
-print(paste0("Finish prediction... accuracy2 = ", accuracy(label, pred2)))
-
-
-
-# load the model
-model <- mx.model.load("chkpt", 1)
-
-#continue training with some new arguments
-model <- mx.model.FeedForward.create(model$symbol, X = dtrain, eval.data = dtest,
-                                     ctx = devices, num.round = 5,
-                                     learning.rate = 0.1, momentum = 0.9,
-                                     epoch.end.callback = mx.callback.save.checkpoint("reload_chkpt"),
-                                     batch.end.callback = mx.callback.log.train.metric(100),
-                                     arg.params = model$arg.params, aux.params = model$aux.params)
-
-# do prediction
-pred <- predict(model, dtest)
-label <- mx.io.extract(dtest, "label")
-dataX <- mx.io.extract(dtest, "data")
-# Predict with R's array
-pred2 <- predict(model, X = dataX)
-
-accuracy <- function(label, pred) {
-  ypred <- max.col(t(as.array(pred)))
-  return(sum((as.array(label) + 1) == ypred) / length(label))
-}
-
-print(paste0("Finish prediction... accuracy=", accuracy(label, pred)))
-print(paste0("Finish prediction... accuracy2=", accuracy(label, pred2)))
diff --git a/R-package/tests/testthat/test_img_seg.R b/R-package/tests/testthat/test_img_seg.R
deleted file mode 100644
index 4af7d62cf533..000000000000
--- a/R-package/tests/testthat/test_img_seg.R
+++ /dev/null
@@ -1,165 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-require(mxnet)
-
-source("get_data.R")
-
-if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == 
-  1) {
-  mx.ctx.default(new = mx.gpu())
-  message("Using GPU for testing.")
-}
-
-print_inferred_shape <- function(net) {
-  slist <- mx.symbol.infer.shape(symbol = net, data = c(168, 168, 1, 2))
-  print(slist$out.shapes)
-}
-
-convolution_module <- function(net, kernel_size, pad_size, filter_count, stride = c(1, 
-  1), work_space = 2048, batch_norm = TRUE, down_pool = FALSE, up_pool = FALSE, 
-  act_type = "relu", convolution = TRUE) {
-  if (up_pool) {
-    net <- mx.symbol.Deconvolution(net, kernel = c(2, 2), pad = c(0, 0), stride = c(2, 
-      2), num_filter = filter_count, workspace = work_space)
-    net <- mx.symbol.BatchNorm(net)
-    if (act_type != "") {
-      net <- mx.symbol.Activation(net, act_type = act_type)
-    }
-  }
-  if (convolution) {
-    conv <- mx.symbol.Convolution(data = net, kernel = kernel_size, stride = stride, 
-      pad = pad_size, num_filter = filter_count, workspace = work_space)
-    net <- conv
-  }
-  if (batch_norm) {
-    net <- mx.symbol.BatchNorm(net)
-  }
-  
-  if (act_type != "") {
-    net <- mx.symbol.Activation(net, act_type = act_type)
-  }
-  
-  if (down_pool) {
-    pool <- mx.symbol.Pooling(net, pool_type = "max", kernel = c(2, 2), stride = c(2, 
-      2))
-    net <- pool
-  }
-  print_inferred_shape(net)
-  return(net)
-}
-
-get_unet <- function() {
-  data <- mx.symbol.Variable("data")
-  kernel_size <- c(3, 3)
-  pad_size <- c(1, 1)
-  filter_count <- 32
-  pool1 <- convolution_module(data, kernel_size, pad_size, filter_count = filter_count, 
-    down_pool = TRUE)
-  net <- pool1
-  pool2 <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
-    2, down_pool = TRUE)
-  net <- pool2
-  pool3 <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
-    4, down_pool = TRUE)
-  net <- pool3
-  pool4 <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
-    4, down_pool = TRUE)
-  net <- pool4
-  net <- mx.symbol.Dropout(net)
-  pool5 <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
-    8, down_pool = TRUE)
-  net <- pool5
-  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
-    4, up_pool = TRUE)
-  net <- convolution_module(net, kernel_size, pad_size = c(2, 2), filter_count = filter_count * 
-    4, up_pool = TRUE)
-  net <- mx.symbol.Crop(net, pool3, num.args = 2)
-  net <- mx.symbol.concat(c(pool3, net), num.args = 2)
-  net <- mx.symbol.Dropout(net)
-  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
-    4)
-  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
-    4, up_pool = TRUE)
-  
-  net <- mx.symbol.concat(c(pool2, net), num.args = 2)
-  net <- mx.symbol.Dropout(net)
-  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
-    4)
-  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
-    4, up_pool = TRUE)
-  convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
-    4)
-  net <- mx.symbol.concat(c(pool1, net), num.args = 2)
-  net <- mx.symbol.Dropout(net)
-  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
-    2)
-  net <- convolution_module(net, kernel_size, pad_size, filter_count = filter_count * 
-    2, up_pool = TRUE)
-  net <- convolution_module(net, kernel_size, pad_size, filter_count = 1, batch_norm = FALSE, 
-    act_type = "")
-  net <- mx.symbol.SoftmaxOutput(data = net, name = "sm")
-  return(net)
-}
-
-context("Image segmentation")
-
-test_that("UNET", {
-  list.of.packages <- c("imager")
-  new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, 
-    "Package"])]
-  if (length(new.packages)) 
-    install.packages(new.packages, repos = "https://cloud.r-project.org/")
-  GetISBI_data()
-  library(imager)
-  IMG_SIZE <- 168
-  files <- list.files(path = "data/ISBI/train-volume/")
-  a <- "data/ISBI/train-volume/"
-  filess <- paste(a, files, sep = "")
-  list_of_images <- lapply(filess, function(x) {
-    x <- load.image(x)
-    y <- resize(x, size_x = IMG_SIZE, size_y = IMG_SIZE)
-  })
-  
-  train.x <- do.call("cbind", lapply(list_of_images, as.vector))
-  train.array <- train.x
-  dim(train.array) <- c(IMG_SIZE, IMG_SIZE, 1, 30)
-  
-  files <- list.files(path = "data/ISBI/train-labels")
-  b <- "data/ISBI/train-labels/"
-  filess <- paste(b, files, sep = "")
-  list_of_images <- lapply(filess, function(x) {
-    x <- load.image(x)
-    y <- resize(x, size_x = IMG_SIZE, size_y = IMG_SIZE)
-  })
-  
-  train.y <- do.call("cbind", lapply(list_of_images, as.vector))
-  
-  train.y[which(train.y < 0.5)] <- 0
-  train.y[which(train.y > 0.5)] <- 1
-  train.y.array <- train.y
-  dim(train.y.array) <- c(IMG_SIZE, IMG_SIZE, 1, 30)
-  
-  devices <- mx.ctx.default()
-  mx.set.seed(0)
-  
-  net <- get_unet()
-  
-  model <- mx.model.FeedForward.create(net, X = train.array, y = train.y.array, 
-    ctx = devices, num.round = 2, initializer = mx.init.normal(sqrt(2/576)), 
-    learning.rate = 0.05, momentum = 0.99, array.batch.size = 2)
-})
diff --git a/R-package/tests/testthat/test_model.R b/R-package/tests/testthat/test_model.R
deleted file mode 100644
index e62f334a4ab8..000000000000
--- a/R-package/tests/testthat/test_model.R
+++ /dev/null
@@ -1,290 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-require(mxnet)
-
-source("get_data.R")
-
-context("models")
-
-if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == 
-  1) {
-  mx.ctx.default(new = mx.gpu())
-  message("Using GPU for testing.")
-}
-
-test_that("MNIST", {
-  # # Network configuration
-  GetMNIST_ubyte()
-  batch.size <- 100
-  data <- mx.symbol.Variable("data")
-  fc1 <- mx.symbol.FullyConnected(data, name = "fc1", num_hidden = 128)
-  act1 <- mx.symbol.Activation(fc1, name = "relu1", act_type = "relu")
-  fc2 <- mx.symbol.FullyConnected(act1, name = "fc2", num_hidden = 64)
-  act2 <- mx.symbol.Activation(fc2, name = "relu2", act_type = "relu")
-  fc3 <- mx.symbol.FullyConnected(act2, name = "fc3", num_hidden = 10)
-  softmax <- mx.symbol.Softmax(fc3, name = "sm")
-  
-  dtrain <- mx.io.MNISTIter(image = "data/train-images-idx3-ubyte", label = "data/train-labels-idx1-ubyte", 
-    data.shape = c(784), batch.size = batch.size, shuffle = TRUE, flat = TRUE, 
-    silent = 0, seed = 10)
-  
-  dtest <- mx.io.MNISTIter(image = "data/t10k-images-idx3-ubyte", label = "data/t10k-labels-idx1-ubyte", 
-    data.shape = c(784), batch.size = batch.size, shuffle = FALSE, flat = TRUE, 
-    silent = 0)
-  
-  mx.set.seed(0)
-  
-  # create the model
-  model <- mx.model.FeedForward.create(softmax, X = dtrain, eval.data = dtest, 
-    ctx = mx.ctx.default(), num.round = 1, learning.rate = 0.1, momentum = 0.9, 
-    initializer = mx.init.uniform(0.07), epoch.end.callback = mx.callback.save.checkpoint("chkpt"), 
-    batch.end.callback = mx.callback.log.train.metric(100))
-  
-  # do prediction
-  pred <- predict(model, dtest)
-  label <- mx.io.extract(dtest, "label")
-  dataX <- mx.io.extract(dtest, "data")
-  # Predict with R's array
-  pred2 <- predict(model, X = dataX)
-  
-  accuracy <- function(label, pred) {
-    ypred <- max.col(t(as.array(pred)))
-    return(sum((as.array(label) + 1) == ypred)/length(label))
-  }
-  
-  expect_equal(accuracy(label, pred), accuracy(label, pred2), tolerance = 0.1)
-  
-  file.remove("chkpt-0001.params")
-  file.remove("chkpt-symbol.json")
-})
-
-test_that("Regression", {
-  data(BostonHousing, package = "mlbench")
-  train.ind <- seq(1, 506, 3)
-  train.x <- data.matrix(BostonHousing[train.ind, -14])
-  train.y <- BostonHousing[train.ind, 14]
-  test.x <- data.matrix(BostonHousing[-train.ind, -14])
-  test.y <- BostonHousing[-train.ind, 14]
-  data <- mx.symbol.Variable("data")
-  fc1 <- mx.symbol.FullyConnected(data, num_hidden = 1)
-  lro <- mx.symbol.LinearRegressionOutput(fc1)
-  
-  demo.metric.mae <- mx.metric.custom("mae", function(label, pred) {
-    pred <- mx.nd.reshape(pred, shape = 0)
-    res <- mx.nd.mean(mx.nd.abs(label - pred))
-    return(as.array(res))
-  })
-  mx.set.seed(0)
-  model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y, ctx = mx.ctx.default(), 
-    num.round = 5, array.batch.size = 20, learning.rate = 2e-06, momentum = 0.9, 
-    eval.metric = demo.metric.mae)
-  
-  train.x <- data.matrix(BostonHousing[train.ind, -(13:14)])
-  train.y <- BostonHousing[train.ind, c(13:14)]
-  test.x <- data.matrix(BostonHousing[-train.ind, -(13:14)])
-  test.y <- BostonHousing[-train.ind, c(13:14)]
-  
-  data <- mx.symbol.Variable("data")
-  fc2 <- mx.symbol.FullyConnected(data, num_hidden = 2)
-  lro2 <- mx.symbol.LinearRegressionOutput(fc2)
-  
-  mx.set.seed(0)
-  train_iter <- mx.io.arrayiter(data = t(train.x), label = t(train.y))
-  
-  model <- mx.model.FeedForward.create(lro2, X = train_iter, ctx = mx.ctx.default(), 
-    num.round = 50, array.batch.size = 20, learning.rate = 2e-06, momentum = 0.9)
-})
-
-
-test_that("Classification", {
-  data(Sonar, package = "mlbench")
-  Sonar[, 61] <- as.numeric(Sonar[, 61]) - 1
-  train.ind <- c(1:50, 100:150)
-  train.x <- data.matrix(Sonar[train.ind, 1:60])
-  train.y <- Sonar[train.ind, 61]
-  test.x <- data.matrix(Sonar[-train.ind, 1:60])
-  test.y <- Sonar[-train.ind, 61]
-  mx.set.seed(0)
-  model <- mx.mlp(train.x, train.y, hidden_node = 10, out_node = 2, out_activation = "softmax", 
-    num.round = 5, array.batch.size = 15, learning.rate = 0.07, momentum = 0.9, 
-    eval.metric = mx.metric.accuracy)
-})
-
-test_that("Fine-tune", {
-  GetInception()
-  GetCatDog()
-  train_iter <- mx.io.ImageRecordIter(path.imgrec = "./data/cats_dogs/cats_dogs_train.rec", 
-    batch.size = 8, data.shape = c(224, 224, 3), rand.crop = TRUE, rand.mirror = TRUE)
-  val_iter <- mx.io.ImageRecordIter(path.imgrec = "./data/cats_dogs/cats_dogs_val.rec", 
-    batch.size = 8, data.shape = c(224, 224, 3), rand.crop = FALSE, rand.mirror = FALSE)
-  inception_bn <- mx.model.load("./model/Inception-BN", iteration = 126)
-  symbol <- inception_bn$symbol
-  internals <- symbol$get.internals()
-  outputs <- internals$outputs
-  
-  flatten <- internals$get.output(which(outputs == "flatten_output"))
-  
-  new_fc <- mx.symbol.FullyConnected(data = flatten, num_hidden = 2, name = "fc1")
-  new_soft <- mx.symbol.SoftmaxOutput(data = new_fc, name = "softmax")
-  arg_params_new <- mx.model.init.params(symbol = new_soft, input.shape = list(data = c(224, 
-    224, 3, 8)), output.shape = NULL, initializer = mx.init.uniform(0.1), ctx = mx.cpu())$arg.params
-  fc1_weights_new <- arg_params_new[["fc1_weight"]]
-  fc1_bias_new <- arg_params_new[["fc1_bias"]]
-  
-  arg_params_new <- inception_bn$arg.params
-  
-  arg_params_new[["fc1_weight"]] <- fc1_weights_new
-  arg_params_new[["fc1_bias"]] <- fc1_bias_new
-  
-  # model <- mx.model.FeedForward.create(symbol = new_soft, X = train_iter,
-  # eval.data = val_iter, ctx = mx.ctx.default(), eval.metric = mx.metric.accuracy,
-  # num.round = 2, learning.rate = 0.05, momentum = 0.9, wd = 0.00001, kvstore =
-  # 'local', batch.end.callback = mx.callback.log.train.metric(50), initializer =
-  # mx.init.Xavier(factor_type = 'in', magnitude = 2.34), optimizer = 'sgd',
-  # arg.params = arg_params_new, aux.params = inception_bn$aux.params)
-})
-
-test_that("Matrix Factorization", {
-  
-  # Use fake random data instead of GetMovieLens() to remove external dependency
-  set.seed(123)
-  user <- sample(943, size = 1e+05, replace = T)
-  item <- sample(1682, size = 1e+05, replace = T)
-  score <- sample(5, size = 1e+05, replace = T)
-  DF <- data.frame(user, item, score)
-  
-  max_user <- max(DF$user)
-  max_item <- max(DF$item)
-  DF_mat_x <- data.matrix(t(DF[, 1:2]))
-  DF_y <- DF[, 3]
-  k <- 64
-  user <- mx.symbol.Variable("user")
-  item <- mx.symbol.Variable("item")
-  score <- mx.symbol.Variable("score")
-  user1 <- mx.symbol.Embedding(data = mx.symbol.BlockGrad(user), input_dim = max_user, 
-    output_dim = k, name = "user1")
-  item1 <- mx.symbol.Embedding(data = mx.symbol.BlockGrad(item), input_dim = max_item, 
-    output_dim = k, name = "item1")
-  pred <- user1 * item1
-  pred1 <- mx.symbol.sum_axis(pred, axis = 1, name = "pred1")
-  pred2 <- mx.symbol.Flatten(pred1, name = "pred2")
-  pred3 <- mx.symbol.LinearRegressionOutput(data = pred2, label = score, name = "pred3")
-  
-  mx.set.seed(123)
-  
-  CustomIter <- setRefClass("CustomIter", fields = c("iter1", "iter2"), contains = "Rcpp_MXArrayDataIter", 
-    methods = list(initialize = function(iter1, iter2) {
-      .self$iter1 <- iter1
-      .self$iter2 <- iter2
-      .self
-    }, value = function() {
-      user <- .self$iter1$value()$data
-      item <- .self$iter2$value()$data
-      score <- .self$iter1$value()$label
-      list(user = user, item = item, score = score)
-    }, iter.next = function() {
-      .self$iter1$iter.next()
-      .self$iter2$iter.next()
-    }, reset = function() {
-      .self$iter1$reset()
-      .self$iter2$reset()
-    }, num.pad = function() {
-      .self$iter1$num.pad()
-    }, finalize = function() {
-      .self$iter1$finalize()
-      .self$iter2$finalize()
-    }))
-  
-  user_iter <- mx.io.arrayiter(data = DF[, 1], label = DF[, 3], batch.size = k)
-  
-  item_iter <- mx.io.arrayiter(data = DF[, 2], label = DF[, 3], batch.size = k)
-  
-  train_iter <- CustomIter$new(user_iter, item_iter)
-  
-  model <- mx.model.FeedForward.create(pred3, X = train_iter, ctx = mx.ctx.default(), 
-    num.round = 5, initializer = mx.init.uniform(0.07), learning.rate = 0.07, 
-    eval.metric = mx.metric.rmse, momentum = 0.9, epoch.end.callback = mx.callback.log.train.metric(1), 
-    input.names = c("user", "item"), output.names = "score")
-})
-
-test_that("Captcha", {
-  GetCaptcha_data()
-  data.shape <- c(80, 30, 3)
-  batch_size <- 40
-  train <- mx.io.ImageRecordIter(path.imgrec = "./data/captcha_example/captcha_train.rec", 
-    path.imglist = "./data/captcha_example/captcha_train.lst", batch.size = batch_size, 
-    label.width = 4, data.shape = data.shape, mean.img = "mean.bin")
-  
-  val <- mx.io.ImageRecordIter(path.imgrec = "./data/captcha_example/captcha_test.rec", 
-    path.imglist = "./data/captcha_example/captcha_test.lst", batch.size = batch_size, 
-    label.width = 4, data.shape = data.shape, mean.img = "mean.bin")
-  
-  data <- mx.symbol.Variable("data")
-  label <- mx.symbol.Variable("label")
-  conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 32)
-  pool1 <- mx.symbol.Pooling(data = conv1, pool_type = "max", kernel = c(2, 2), 
-    stride = c(1, 1))
-  relu1 <- mx.symbol.Activation(data = pool1, act_type = "relu")
-  
-  conv2 <- mx.symbol.Convolution(data = relu1, kernel = c(5, 5), num_filter = 32)
-  pool2 <- mx.symbol.Pooling(data = conv2, pool_type = "avg", kernel = c(2, 2), 
-    stride = c(1, 1))
-  relu2 <- mx.symbol.Activation(data = pool2, act_type = "relu")
-  
-  flatten <- mx.symbol.Flatten(data = relu2)
-  fc1 <- mx.symbol.FullyConnected(data = flatten, num_hidden = 120)
-  fc21 <- mx.symbol.FullyConnected(data = fc1, num_hidden = 10)
-  fc22 <- mx.symbol.FullyConnected(data = fc1, num_hidden = 10)
-  fc23 <- mx.symbol.FullyConnected(data = fc1, num_hidden = 10)
-  fc24 <- mx.symbol.FullyConnected(data = fc1, num_hidden = 10)
-  fc2 <- mx.symbol.concat(c(fc21, fc22, fc23, fc24), dim = 0, num.args = 4)
-  label <- mx.symbol.transpose(data = label)
-  label <- mx.symbol.Reshape(data = label, target_shape = c(0))
-  captcha_net <- mx.symbol.SoftmaxOutput(data = fc2, label = label, name = "softmax")
-  
-  mx.metric.acc2 <- mx.metric.custom("accuracy", function(label, pred) {
-    label <- as.array(label)
-    pred <- as.array(pred)
-    ypred <- max.col(t(pred)) - 1
-    ypred <- matrix(ypred, nrow = nrow(label), ncol = ncol(label), byrow = TRUE)
-    return(sum(colSums(label == ypred) == 4)/ncol(label))
-  })
-  
-  mx.set.seed(42)
-  
-  train$reset()
-  train$iter.next()
-  
-  input.names <- "data"
-  input.shape <- sapply(input.names, function(n) {
-    dim(train$value()[[n]])
-  }, simplify = FALSE)
-  arg_names <- arguments(captcha_net)
-  output.names <- "label"
-  output.shape <- sapply(output.names, function(n) {
-    dim(train$value()[[n]])
-  }, simplify = FALSE)
-  params <- mx.model.init.params(captcha_net, input.shape, output.shape, mx.init.Xavier(factor_type = "in", 
-    magnitude = 2.34), mx.cpu())
-  
-  # model <- mx.model.FeedForward.create( X = train, eval.data = val, ctx =
-  # mx.ctx.default(), symbol = captcha_net, eval.metric = mx.metric.acc2, num.round
-  # = 1, learning.rate = 1e-04, momentum = 0.9, wd = 1e-05, batch.end.callback =
-  # mx.callback.log.train.metric(50), initializer = mx.init.Xavier(factor_type =
-  # 'in', magnitude = 2.34), optimizer = 'sgd', clip_gradient = 10)
-})
diff --git a/R-package/tests/testthat/test_optimizer.R b/R-package/tests/testthat/test_optimizer.R
deleted file mode 100644
index cbe9575c90ca..000000000000
--- a/R-package/tests/testthat/test_optimizer.R
+++ /dev/null
@@ -1,251 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-context("optimizer")
-
-if (Sys.getenv("R_GPU_ENABLE") != "" & as.integer(Sys.getenv("R_GPU_ENABLE")) == 
-		1) {
-	mx.ctx.default(new = mx.gpu())
-	message("Using GPU for testing.")
-}
-
-test_that("sgd", {
-  
-  data <- mx.symbol.Variable("data")
-  label <- mx.symbol.Variable("label")
-  fc_weight <- mx.symbol.Variable("fc_weight")
-  fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, 
-    name = "fc1", num_hidden = 1)
-  loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
-  
-  x <- mx.nd.array(array(1:6, dim = 2:3))
-  y <- mx.nd.array(c(5, 11, 16))
-  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
-  
-  exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, 
-    fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", 
-    "null"))
-  
-  optimizer <- mx.opt.create("sgd", learning.rate = 1, momentum = 0, wd = 0, rescale.grad = 1, 
-    clip_gradient = -1)
-  
-  updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
-  
-  mx.exec.forward(exec, is.train = T)
-  mx.exec.backward(exec)
-  
-  arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
-  mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
-  
-  expect_equal(as.array(arg.blocks[[2]]), array(c(1.4, 2.6), dim = c(2, 1)), tolerance = 0.1)
-  
-})
-
-
-test_that("rmsprop", {
-  
-  data <- mx.symbol.Variable("data")
-  label <- mx.symbol.Variable("label")
-  fc_weight <- mx.symbol.Variable("fc_weight")
-  fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, 
-    name = "fc1", num_hidden = 1)
-  loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
-  
-  x <- mx.nd.array(array(1:6, dim = 2:3))
-  y <- mx.nd.array(c(5, 11, 16))
-  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
-  
-  exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, 
-    fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", 
-    "null"))
-  
-  optimizer <- mx.opt.create("rmsprop", learning.rate = 1, centered = TRUE, rho = 0.95,
-    momentum = 0.9, epsilon = 1e-04, wd = 0, rescale.grad = 1, clip_gradient = -1)
-  
-  updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
-  
-  mx.exec.forward(exec, is.train = T)
-  mx.exec.backward(exec)
-  
-  arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
-  mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
-  
-  expect_equal(as.array(arg.blocks[[2]]), array(c(5.64, 6.38), dim = c(2, 1)), 
-    tolerance = 0.1)
-  
-})
-
-
-test_that("adam", {
-  
-  data <- mx.symbol.Variable("data")
-  label <- mx.symbol.Variable("label")
-  fc_weight <- mx.symbol.Variable("fc_weight")
-  fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, 
-    name = "fc1", num_hidden = 1)
-  loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
-  
-  x <- mx.nd.array(array(1:6, dim = 2:3))
-  y <- mx.nd.array(c(5, 11, 16))
-  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
-  
-  exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, 
-    fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", 
-    "null"))
-  
-  optimizer <- mx.opt.create("adam", learning.rate = 1, beta1 = 0.9, beta2 = 0.999, 
-    epsilon = 1e-08, wd = 0, rescale.grad = 1, clip_gradient = -1)
-  
-  updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
-  
-  mx.exec.forward(exec, is.train = T)
-  mx.exec.backward(exec)
-  
-  arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
-  mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
-  
-  expect_equal(as.array(arg.blocks[[2]]), array(c(4.26, 4.96), dim = c(2, 1)), 
-    tolerance = 0.1)
-  
-})
-
-
-test_that("adagrad", {
-  
-  data <- mx.symbol.Variable("data")
-  label <- mx.symbol.Variable("label")
-  fc_weight <- mx.symbol.Variable("fc_weight")
-  fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, 
-    name = "fc1", num_hidden = 1)
-  loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
-  
-  x <- mx.nd.array(array(1:6, dim = 2:3))
-  y <- mx.nd.array(c(5, 11, 16))
-  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
-  
-  exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, 
-    fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", 
-    "null"))
-  
-  optimizer <- mx.opt.create("adagrad", learning.rate = 1, epsilon = 1e-08, wd = 0, 
-    rescale.grad = 1, clip_gradient = -1)
-  
-  updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
-  
-  mx.exec.forward(exec, is.train = T)
-  mx.exec.backward(exec)
-  
-  arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
-  mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
-  
-  expect_equal(as.array(arg.blocks[[2]]), array(c(2.1, 2.8), dim = c(2, 1)), tolerance = 0.1)
-  
-})
-
-
-test_that("adadelta", {
-  
-  data <- mx.symbol.Variable("data")
-  label <- mx.symbol.Variable("label")
-  fc_weight <- mx.symbol.Variable("fc_weight")
-  fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T, 
-    name = "fc1", num_hidden = 1)
-  loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
-  
-  x <- mx.nd.array(array(1:6, dim = 2:3))
-  y <- mx.nd.array(c(5, 11, 16))
-  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
-  
-  exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x, 
-    fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", 
-    "null"))
-  
-  optimizer <- mx.opt.create("adadelta", rho = 0.9, epsilon = 1e-05, wd = 0, rescale.grad = 1, 
-    clip_gradient = -1)
-  
-  updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
-  
-  mx.exec.forward(exec, is.train = T)
-  mx.exec.backward(exec)
-  
-  arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
-  mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
-
-  expect_equal(as.array(arg.blocks[[2]]), array(c(1.11, 1.81), dim = c(2, 1)), 
-    tolerance = 0.1)
-  
-})
-
-
-test_that("nag_no_momentum", {
-  data <- mx.symbol.Variable("data")
-  label <- mx.symbol.Variable("label")
-  fc_weight <- mx.symbol.Variable("fc_weight")
-  fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T,
-	name = "fc1", num_hidden = 1)
-  loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
-
-  x <- mx.nd.array(array(1:6, dim = 2:3))
-	y <- mx.nd.array(c(5, 11, 16))
-	w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
-
-	exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x,
-    fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null"))
-
-  optimizer <- mx.opt.create("nag", learning.rate = 1, momentum = 0, wd = 0, rescale.grad = 1,
-	  clip_gradient = -1)
-
-	updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
-	
-  mx.exec.forward(exec, is.train = T)
-	mx.exec.backward(exec)
-		
-  arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
-	mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
-		
-  expect_equal(as.array(arg.blocks[[2]]), array(c(1.4, 2.6), dim = c(2, 1)), tolerance = 0.05)
-})
-
-
-test_that("nag_momentum", {
-  data <- mx.symbol.Variable("data")
-  label <- mx.symbol.Variable("label")
-  fc_weight <- mx.symbol.Variable("fc_weight")
-  fc <- mx.symbol.FullyConnected(data = data, weight = fc_weight, no.bias = T,
-                                 name = "fc1", num_hidden = 1)
-  loss <- mx.symbol.LinearRegressionOutput(data = fc, label = label, name = "loss")
-  
-  x <- mx.nd.array(array(1:6, dim = 2:3))
-  y <- mx.nd.array(c(5, 11, 16))
-  w1 <- mx.nd.array(array(c(1.1, 1.8), dim = c(2, 1)))
-  
-  exec <- mxnet:::mx.symbol.bind(symbol = loss, ctx = mx.ctx.default(), arg.arrays = list(data = x,
-                                                                                          fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null"))
-  
-  optimizer <- mx.opt.create("nag", learning.rate = 1, momentum = 0.1, wd = 0, rescale.grad = 1,
-                             clip_gradient = 5)
-  
-  updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default())
-  
-  mx.exec.forward(exec, is.train = T)
-  mx.exec.backward(exec)
-  
-  arg.blocks <- updaters(exec$ref.arg.arrays, exec$ref.grad.arrays)
-  mx.exec.update.arg.arrays(exec, arg.blocks, skip.null = TRUE)
-  
-  expect_equal(as.array(arg.blocks[[2]]), array(c(1.45, 2.65), dim = c(2, 1)), tolerance = 0.1)
-})
diff --git a/R-package/tests/testthat/test_symbol.R b/R-package/tests/testthat/test_symbol.R
index c93118a7db1f..acad98ac7b1f 100644
--- a/R-package/tests/testthat/test_symbol.R
+++ b/R-package/tests/testthat/test_symbol.R
@@ -91,17 +91,6 @@ test_that("symbol infer type", {
   expect_equal(ret, NULL)
 })
 
-test_that("symbol save/load", {
-  data <- mx.symbol.Variable("data")
-  fc1 <- mx.symbol.FullyConnected(data, num_hidden = 1)
-  lro <- mx.symbol.LinearRegressionOutput(fc1)
-  mx.symbol.save(lro, "tmp_r_sym.json")
-  data2 <- mx.symbol.load("tmp_r_sym.json")
-  
-  expect_equal(data2$as.json(), lro$as.json())
-  file.remove("tmp_r_sym.json")
-})
-
 test_that("symbol attributes access", {
   str <- "(1, 1, 1, 1)"
   x <- mx.symbol.Variable("x")
diff --git a/R-package/vignettes/CallbackFunction.Rmd b/R-package/vignettes/CallbackFunction.Rmd
deleted file mode 100644
index 12b7e28247e9..000000000000
--- a/R-package/vignettes/CallbackFunction.Rmd
+++ /dev/null
@@ -1,160 +0,0 @@
-# Customized callback function
-
-This vignette gives users a guideline for using and writing callback functions,
-which can be very useful in model training. 
-
-## Model training example
-
-Let's begin from a small example. We can build and train a model using the following code:
-
-```{r}
-library(mxnet)
-data(BostonHousing, package="mlbench")
-train.ind = seq(1, 506, 3)
-train.x = data.matrix(BostonHousing[train.ind, -14])
-train.y = BostonHousing[train.ind, 14]
-test.x = data.matrix(BostonHousing[-train.ind, -14])
-test.y = BostonHousing[-train.ind, 14]
-data <- mx.symbol.Variable("data")
-fc1 <- mx.symbol.FullyConnected(data, num_hidden=1)
-lro <- mx.symbol.LinearRegressionOutput(fc1)
-mx.set.seed(0)
-model <- mx.model.FeedForward.create(
-  lro, X=train.x, y=train.y,
-  eval.data=list(data=test.x, label=test.y),
-  ctx=mx.cpu(), num.round=10, array.batch.size=20,
-  learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse)
-```
-
-Besides, we provide two optional parameters, `batch.end.callback` and `epoch.end.callback`, which can provide great flexibility in model training.
-
-## How to use callback functions
-
-
-Two callback functions are provided in this package:
-
-- `mx.callback.save.checkpoint` is used to save checkpoint to files each period iteration.
-
-```{r}
-model <- mx.model.FeedForward.create(
-  lro, X=train.x, y=train.y,
-  eval.data=list(data=test.x, label=test.y),
-  ctx=mx.cpu(), num.round=10, array.batch.size=20,
-  learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
-  epoch.end.callback = mx.callback.save.checkpoint("boston"))
-list.files(pattern = "^boston")
-```
-
-
-- `mx.callback.log.train.metric` is used to log training metric each period.
-You can use it either as a `batch.end.callback` or a `epoch.end.callback`.
-
-```{r}
-model <- mx.model.FeedForward.create(
-  lro, X=train.x, y=train.y,
-  eval.data=list(data=test.x, label=test.y),
-  ctx=mx.cpu(), num.round=10, array.batch.size=20,
-  learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
-  batch.end.callback = mx.callback.log.train.metric(5))
-```
-
-You can also save the training and evaluation errors for later usage by passing a reference class.
-
-```{r}
-logger <- mx.metric.logger$new()
-model <- mx.model.FeedForward.create(
-  lro, X=train.x, y=train.y,
-  eval.data=list(data=test.x, label=test.y),
-  ctx=mx.cpu(), num.round=10, array.batch.size=20,
-  learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
-  epoch.end.callback = mx.callback.log.train.metric(5, logger))
-head(logger$train)
-head(logger$eval)
-```
-
-## How to write your own callback functions
-
-
-You can find the source code for two callback functions from [here](https://github.com/dmlc/mxnet/blob/master/R-package/R/callback.R) and they can be used as your template:
-
-Basically, all callback functions follow the structure below:
-
-```{r, eval=FALSE}
-mx.callback.fun <- function() {
-  function(iteration, nbatch, env, verbose) {
-  }
-}
-```
-
-The `mx.callback.save.checkpoint` function below is stateless. It just get the model from environment and save it.
-
-```{r, eval=FALSE}
-mx.callback.save.checkpoint <- function(prefix, period=1) {
-  function(iteration, nbatch, env, verbose=TRUE) {
-    if (iteration %% period == 0) {
-      mx.model.save(env$model, prefix, iteration)
-      if(verbose) message(sprintf("Model checkpoint saved to %s-%04d.params\n", prefix, iteration))
-    }
-    return(TRUE)
-  }
-}
-```
-
-The `mx.callback.log.train.metric` is a little more complex. It holds a reference class and update it during the training process.
-
-```{r, eval=FALSE}
-mx.callback.log.train.metric <- function(period, logger=NULL) {
-  function(iteration, nbatch, env, verbose=TRUE) {
-    if (nbatch %% period == 0 && !is.null(env$metric)) {
-      result <- env$metric$get(env$train.metric)
-      if (nbatch != 0 & verbose)
-        message(paste0("Batch [", nbatch, "] Train-", result$name, "=", result$value))
-      if (!is.null(logger)) {
-        if (class(logger) != "mx.metric.logger") {
-          stop("Invalid mx.metric.logger.")
-        }
-        logger$train <- c(logger$train, result$value)
-        if (!is.null(env$eval.metric)) {
-          result <- env$metric$get(env$eval.metric)
-          if (nbatch != 0 & verbose)
-            message(paste0("Batch [", nbatch, "] Validation-", result$name, "=", result$value))
-          logger$eval <- c(logger$eval, result$value)
-        }
-      }
-    }
-    return(TRUE)
-  }
-}
-```
-
-Now you might be curious why both callback functions `return(TRUE)`.
-Can we `return(FALSE)`?
-
-Yes! You can stop the training early by `return(FALSE)`. See the examples below.
-
-```{r}
-mx.callback.early.stop <- function(eval.metric) {
-  function(iteration, nbatch, env, verbose) {
-    if (!is.null(env$metric)) {
-      if (!is.null(eval.metric)) {
-        result <- env$metric$get(env$eval.metric)
-        if (result$value < eval.metric) {
-          return(FALSE)
-        }
-      }
-    }
-    return(TRUE)
-  }
-}
-model <- mx.model.FeedForward.create(
-  lro, X=train.x, y=train.y,
-  eval.data=list(data=test.x, label=test.y),
-  ctx=mx.cpu(), num.round=10, array.batch.size=20,
-  learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
-  epoch.end.callback = mx.callback.early.stop(10))
-```
-
-You can see once the validation metric goes below the threshold we set, the training process will stop early.
-
-
-<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/R-package/vignettes/CatsDogsFinetune.Rmd b/R-package/vignettes/CatsDogsFinetune.Rmd
deleted file mode 100644
index 726bb1a43c77..000000000000
--- a/R-package/vignettes/CatsDogsFinetune.Rmd
+++ /dev/null
@@ -1,272 +0,0 @@
-# Dogs vs. Cats classification with mxnet and R
-
-## Packages and prerequisites
-
-In this tutorial, we mainly use the following three packages:
-
-* `mxnet`: model training
-* `imager`: image processing
-* `abind`: manipulations with arrays.
-
-It is an end-to-end R solution for the dogs vs cats Kaggle competition (https://www.kaggle.com/c/dogs-vs-cats-redux-kernels-edition/)
-and it can be used as an example for fine-tuning.
-All the code has been tested on Ubuntu 16.04.
-
-```{r, echo=FALSE}
-knitr::opts_chunk$set(eval = FALSE)
-```
-
-
-```{r}
-library(imager)
-library(mxnet)
-library(abind)
-```
-
-
-## Image processing
-
-### Renaming train files
-
-```{r}
-files <- list.files("./train/")
-old_names <- sapply(files, strsplit, split = ".", fixed = TRUE)
-max_length <- max(sapply(old_names, function(x) nchar(x[[2]])))
-zeros <- max_length - sapply(old_names, function(x) nchar(x[[2]]))
-zeros <- sapply(zeros, function(x) paste(rep(0, x), collapse = ""))
-new_names <- Map(function(x, y) {paste0("./train/", x[1], "/", y, x[2], ".jpg")},
-                 x = old_names, y = zeros)
-
-# Full names
-files <- paste0("./train/", files)
-
-dir.create("./train/cat")
-dir.create("./train/dog")
-
-# New names will be in 00001.jpg format
-Map(function(x, y) file.rename(from = x, to = y), files, new_names)
-```
-
-### Training images: 224x224, padded with empty space
-
-```{r}
-files <- list.files("./train/", recursive = TRUE)
-new_names <- paste0("./train_pad_224x224/", files)
-files <- paste0("./train/", files)
-dir.create("./train_pad_224x224/")
-dir.create("./train_pad_224x224/cat")
-dir.create("./train_pad_224x224/dog")
-
-padImage <- function(x) {
-  # Pad an image along the shorter of its first two dimensions by the
-  # difference between the two side lengths, and return the padded image.
-  sides <- dim(x)[1:2]
-  # The first dimension is padded along "x", the second along "y".
-  short_axis <- if (sides[1] < sides[2]) "x" else "y"
-  pad(x, nPix = max(sides) - min(sides), axes = short_axis)
-}
-
-Map(function(x, y) {
-  pad_img <- padImage(load.image(x))
-  res_img <- resize(pad_img,  size_x = 224, size_y = 224)
-  imager::save.image(res_img, y)
-  }, x = files, y = new_names)
-```
-
-### Renaming test files
-
-```{r}
-files <- list.files("./test/")
-max_length <- max(sapply(files, nchar))
-zeros <- max_length - sapply(files, nchar)
-zeros <- sapply(zeros, function(x) paste(rep(0, x), collapse = ""))
-newnames <- paste0("./test/", zeros, files)
-
-files <- paste0("./test/", files)
-
-Map(function(x, y) file.rename(from = x, to = y), files, newnames)
-```
-
-
-### Test images: 224x224, padded with empty space
-
-```{r}
-files <- list.files("./test/")
-new_names <- paste0("./test_pad_224x224/", files)
-files <- paste0("./test/", files)
-dir.create("./test_pad_224x224/")
-
-Map(function(x, y) {
-  pad_img <- padImage(load.image(x))
-  res_img <- resize(pad_img,  size_x = 224, size_y = 224)
-  imager::save.image(res_img, y)
-}, x = files, y = new_names)
-```
-
-### Creating .rec files
-
-```{r}
-cat_files <- list.files("train_pad_224x224/cat/", recursive=TRUE)
-cat_files <- paste0("cat/", cat_files)
-
-dog_files <- list.files("train_pad_224x224/dog/", recursive=TRUE)
-dog_files <- paste0("dog/", dog_files)
-
-train_ind <- sample(length(cat_files), length(cat_files) * 0.8)
-train_data <- c(1:(length(train_ind) * 2))
-train_data <- cbind(train_data, c(rep(0, length(train_ind)), rep(1, length(train_ind))))
-train_data <- cbind(train_data, c(cat_files[train_ind], dog_files[train_ind]))
-train_data <- train_data[sample(nrow(train_data)),]
-write.table(train_data, "cats_dogs_train.lst", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
-im2rec("cats_dogs_train.lst", "train_pad_224x224/", "cats_dogs_train.rec")
-
-val_ind <- c(1:length(cat_files))[!c(1:length(cat_files)) %in% train_ind]
-val_data <- c(1:(length(val_ind) * 2))
-val_data <- cbind(val_data, c(rep(0, length(val_ind)), rep(1, length(val_ind))))
-val_data <- cbind(val_data, c(cat_files[val_ind], dog_files[val_ind]))
-val_data <- val_data[sample(nrow(val_data)),]
-write.table(val_data, "cats_dogs_val.lst", quote = FALSE, sep = "\t", row.names = FALSE, col.names = FALSE)
-im2rec("cats_dogs_val.lst", "train_pad_224x224/", "cats_dogs_val.rec")
-```
-
-## The data iterator
-
-```{r}
-get_iterator <- function(data_shape, train_data, val_data, batch_size = 128) {
-  # Build the image-record iterators for training and validation and return
-  # them as list(train = ..., val = ...).
-  # Random cropping and mirroring are switched on for the training iterator
-  # only; both iterators share the same batch size and data shape.
-  make_iter <- function(rec_path, augment) {
-    mx.io.ImageRecordIter(path.imgrec = rec_path,
-                          batch.size  = batch_size,
-                          data.shape  = data_shape,
-                          rand.crop   = augment,
-                          rand.mirror = augment)
-  }
-
-  list(train = make_iter(train_data, TRUE),
-       val   = make_iter(val_data, FALSE))
-}
-```
-
-
-```{r}
-data  <- get_iterator(data_shape = c(224, 224, 3),
-                      train_data = "cats_dogs_train.rec",
-                      val_data   = "cats_dogs_val.rec",
-                      batch_size = 8)
-train <- data$train
-val   <- data$val
-```
-
-
-## Load pretrained model
-
-Here we use the pretrained model from http://data.mxnet.io/mxnet/data/.
-There are 1000 classes in imagenet,
-and we need to replace the last fully connected layer with a new layer for 2 classes.
-
-
-```{r}
-download.file('http://data.mxnet.io/mxnet/data/Inception.zip', destfile = 'Inception.zip')
-unzip("Inception.zip")
-inception_bn <- mx.model.load("./Inception-BN", iteration = 126)
-
-symbol <- inception_bn$symbol
-# check symbol$arguments for layer names
-internals <- symbol$get.internals()
-outputs <- internals$outputs
-
-flatten <- internals$get.output(which(outputs == "flatten_output"))
-
-new_fc <- mx.symbol.FullyConnected(data = flatten, 
-                                   num_hidden = 2, 
-                                   name = "fc1") 
-# set name to original name in symbol$arguments
-new_soft <- mx.symbol.SoftmaxOutput(data = new_fc, 
-                                    name = "softmax")
-# set name to original name in symbol$arguments
-
-arg_params_new <- mx.model.init.params(symbol = new_soft,
-                                       input.shape = list("data" = c(224, 224, 3, 8)),
-                                       output.shape = NULL,
-                                       initializer = mx.init.uniform(0.1),
-                                       ctx = mx.cpu())$arg.params
-fc1_weights_new <- arg_params_new[["fc1_weight"]]
-fc1_bias_new <- arg_params_new[["fc1_bias"]]
-
-arg_params_new <- inception_bn$arg.params
-
-arg_params_new[["fc1_weight"]] <- fc1_weights_new 
-arg_params_new[["fc1_bias"]] <- fc1_bias_new 
-```
-
-
-## Fine-tuning
-
-```{r}
-model <- mx.model.FeedForward.create(
-  symbol             = new_soft,
-  X                  = train,
-  eval.data          = val,
-  ctx                = mx.gpu(0),
-  eval.metric        = mx.metric.accuracy,
-  num.round          = 2,
-  learning.rate      = 0.05,
-  momentum           = 0.9,
-  wd                 = 0.00001,
-  kvstore            = "local",
-  array.batch.size   = 128,
-  epoch.end.callback = mx.callback.save.checkpoint("inception_bn"),
-  batch.end.callback = mx.callback.log.train.metric(150),
-  initializer        = mx.init.Xavier(factor_type = "in", magnitude = 2.34),
-  optimizer          = "sgd",
-  arg.params         = arg_params_new,
-  aux.params         = inception_bn$aux.params
-)
-```
-## Making predictions
-
-```{r}
-preprocImage <- function(src,              # URL or file location
-                         height = 224,
-                         width = 224,
-                         num_channels = 3, # 3 for RGB, 1 for grayscale
-                         mult_by = 1,      # set to 255 for normalized image
-                         crop = FALSE) {   # no crop by default
-  # Load an image, optionally crop its borders, resize it to width x height,
-  # scale the values by mult_by and return a
-  # width x height x num_channels x 1 array.
-  img <- load.image(src)
-
-  if (crop) {
-    # Border widths are half the excess of each side over the shorter side.
-    sides <- dim(img)[1:2]
-    margins <- floor((sides - min(sides)) / 2)
-    img <- crop.borders(img, margins[1], margins[2])
-  }
-
-  out <- as.array(resize(img, size_x = width, size_y = height)) * mult_by
-  dim(out) <- c(width, height, num_channels, 1)
-  out
-}
-```
-
-```{r}
-files <- list.files("./test_pad_224x224/")
-files <- paste0("./test_pad_224x224/", files)
-
-files <- split(files, rep(1:1250, each = 10))
-probs <- lapply(files, function(x) {
-  images <- lapply(x, preprocImage, mult_by = 255)
-  images <- do.call(abind, images)
-  probs <- predict(model, X = images, ctx = mx.gpu(0))
-})
-saveRDS(probs, "probs.rds")
-probs <- t(do.call(cbind, probs))
-
-preds <- data.frame(id = 1:12500, label = probs[, 2])
-write.csv(preds, "subm.csv", row.names = FALSE, quote = FALSE)
-```
-
-
-<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/R-package/vignettes/CustomIterator.Rmd b/R-package/vignettes/CustomIterator.Rmd
deleted file mode 100644
index b5a6576a5bc6..000000000000
--- a/R-package/vignettes/CustomIterator.Rmd
+++ /dev/null
@@ -1,207 +0,0 @@
-# Customized iterator
-
-
-This tutorial provides a guideline on how to use and write custom iterators, which can be very useful when a dataset does not fit into memory.
-
-## Getting the data
-
-The data we are going to use is the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) in CSV format, which can be found from [here](https://www.kaggle.com/c/digit-recognizer/data).
-
-To download the data:
-
-```{r}
-download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/mnist_csv.zip',
-              destfile = 'mnist_csv.zip')
-unzip('mnist_csv.zip', exdir = '.')
-```
-
-You'll get two files: `mnist_train.csv`, which contains 60,000 examples of handwritten digits, and `mnist_test.csv`, which contains 10,000 examples. The first element of each line in the CSV is the label, which is a number between 0 and 9. The rest of the line consists of 784 numbers between 0 and 255, corresponding to the grey levels of a 28x28 matrix. Therefore, each line contains a 28x28-pixel image of a handwritten digit and its true label.
-
-## Custom CSV Iterator
-
-Next we are going to create a custom CSV Iterator based on the [C++ CSVIterator class](https://github.com/dmlc/mxnet/blob/master/src/io/iter_csv.cc).
-
-For that we are going to use the R function `mx.io.CSVIter` as a base class. This class has the parameters `data.csv, data.shape, batch.size` and two main functions: `iter.next()`, which advances the iterator to the next batch of data, and `value()`, which returns the train data and the label.
-
-The R Custom Iterator needs to inherit from the C++ data iterator class, for that we used the class `Rcpp_MXArrayDataIter` extracted with RCPP. Also, it needs to have the same parameters: `data.csv, data.shape, batch.size`. Apart from that, we can also add the field `iter`, which is the CSV Iterator that we are going to expand.
-
-```{r, eval=FALSE}
-CustomCSVIter <- setRefClass("CustomCSVIter",
-								fields=c("iter", "data.csv", "data.shape", "batch.size"),
-								contains = "Rcpp_MXArrayDataIter",
-								#...
-                            )
-```
-
-The next step is to initialize the class. For that we call the base `mx.io.CSVIter` and fill the rest of the fields.
-
-```{r, eval=FALSE}
-CustomCSVIter <- setRefClass("CustomCSVIter",
-								fields=c("iter", "data.csv", "data.shape", "batch.size"),
-								contains = "Rcpp_MXArrayDataIter",
-								methods=list(
-	                             	initialize=function(iter, data.csv, data.shape, batch.size){
-										feature_len <- data.shape*data.shape + 1
-										csv_iter <- mx.io.CSVIter(data.csv=data.csv, data.shape=c(feature_len), batch.size=batch.size)
-										.self$iter <- csv_iter
-										.self$data.csv <- data.csv
-										.self$data.shape <- data.shape
-										.self$batch.size <- batch.size
-										.self
-	                               	},
-                             	#...
-                             	)
-                            )
-```
-
-So far there is no difference between the original class and the custom class. Let's implement the function `value()`. In this case what we are going to do is transform the data that comes from the original class as an array of 785 numbers into a matrix of 28x28 and a label. We will also normalize the training data to be between 0 and 1.
-
-```{r, eval=FALSE}
-CustomCSVIter <- setRefClass("CustomCSVIter",
-								fields=c("iter", "data.csv", "data.shape", "batch.size"),
-								contains = "Rcpp_MXArrayDataIter",
-								methods=list(
-	                             	initialize=function(iter, data.csv, data.shape, batch.size){
-										feature_len <- data.shape*data.shape + 1
-										csv_iter <- mx.io.CSVIter(data.csv=data.csv, data.shape=c(feature_len), batch.size=batch.size)
-										.self$iter <- csv_iter
-										.self$data.csv <- data.csv
-										.self$data.shape <- data.shape
-										.self$batch.size <- batch.size
-										.self
-	                               	},
-									value=function(){
-										val <- as.array(.self$iter$value()$data)
-										val.x <- val[-1,]
-										val.y <- val[1,]
-										val.x <- val.x/255
-										dim(val.x) <- c(data.shape, data.shape, 1, ncol(val.x))
-										val.x <- mx.nd.array(val.x)
-										val.y <- mx.nd.array(val.y)
-										list(data=val.x, label=val.y)
-									},
-                             	#...
-                             	)
-                            )
-```
-Finally we are going to add the rest of the functions needed for the training to work correctly. The final `CustomCSVIter` looks like this:
-
-```{r}
-CustomCSVIter <- setRefClass("CustomCSVIter",
-  fields = c("iter", "data.csv", "data.shape", "batch.size"),
-  contains = "Rcpp_MXArrayDataIter",
-  methods = list(
-    # Create the wrapped CSV iterator; each record holds
-    # data.shape * data.shape values plus one extra entry.
-    initialize = function(iter, data.csv, data.shape, batch.size) {
-      record_len <- data.shape * data.shape + 1
-      .self$iter <- mx.io.CSVIter(data.csv = data.csv,
-                                  data.shape = c(record_len),
-                                  batch.size = batch.size)
-      .self$data.csv <- data.csv
-      .self$data.shape <- data.shape
-      .self$batch.size <- batch.size
-      .self
-    },
-    # Split the current batch of the wrapped iterator into data and label.
-    value = function() {
-      batch <- as.array(.self$iter$value()$data)
-      labels <- batch[1, ]         # first row is used as the label
-      pixels <- batch[-1, ] / 255  # remaining rows, divided by 255
-      dim(pixels) <- c(data.shape, data.shape, 1, ncol(pixels))
-      list(data = mx.nd.array(pixels), label = mx.nd.array(labels))
-    },
-    # The remaining methods delegate straight to the wrapped iterator.
-    iter.next = function() {
-      .self$iter$iter.next()
-    },
-    reset = function() {
-      .self$iter$reset()
-    },
-    num.pad = function() {
-      .self$iter$num.pad()
-    },
-    finalize = function() {
-      .self$iter$finalize()
-    }
-  )
-)
-```
-
-To call the class we can just do:
-
-```{r}
-batch.size <- 100
-train.iter <- CustomCSVIter$new(iter = NULL, data.csv = "mnist_train.csv", data.shape = 28, batch.size = batch.size)
-```
-
-## CNN Model
-
-
-For this tutorial we are going to use the known LeNet architecture:
-
-```{r}
-library(mxnet)
-lenet.model <- function(){
-  # Assemble the LeNet symbol: two convolution stages followed by two fully
-  # connected layers and a softmax output.
-  # One stage = 5x5 convolution -> tanh -> 2x2 max pooling with stride 2.
-  conv_stage <- function(input, filters) {
-    conv <- mx.symbol.Convolution(data=input, kernel=c(5,5), num_filter=filters)
-    act <- mx.symbol.Activation(data=conv, act_type="tanh")
-    mx.symbol.Pooling(data=act, pool_type="max", kernel=c(2,2), stride=c(2,2))
-  }
-
-  input <- mx.symbol.Variable('data')
-  stage1 <- conv_stage(input, 20)   # first conv
-  stage2 <- conv_stage(stage1, 50)  # second conv
-  flat <- mx.symbol.Flatten(data=stage2)
-  hidden <- mx.symbol.FullyConnected(data=flat, num_hidden=100) # first fullc
-  hidden_act <- mx.symbol.Activation(data=hidden, act_type="tanh")
-  scores <- mx.symbol.FullyConnected(data=hidden_act, num_hidden=10) # second fullc
-  mx.symbol.SoftmaxOutput(data=scores) # loss
-}
-network <- lenet.model()
-```
-
-## Training with the Custom Iterator
-
-Finally, we can directly add the custom iterator as the training data source.
-
-```{r, eval=FALSE}
-model <- mx.model.FeedForward.create(symbol=network,
-                                     X=train.iter,
-                                     ctx=mx.gpu(0),
-                                     num.round=10,
-                                     array.batch.size=batch.size,
-                                     learning.rate=0.1,
-                                     momentum=0.9,  
-                                     eval.metric=mx.metric.accuracy,
-                                     wd=0.00001,
-                                     batch.end.callback=mx.callback.log.speedometer(batch.size, frequency = 100)
-                                     )
-```
-
-The last 2 iterations with a K80 GPU look like this:
-
-```
-## [8] Train-accuracy=0.998866666666667
-## Batch [100] Speed: 15413.0104454713 samples/sec Train-accuracy=0.999
-## Batch [200] Speed: 16629.3412459049 samples/sec Train-accuracy=0.99935
-## Batch [300] Speed: 18412.6900509319 samples/sec Train-accuracy=0.9995
-## Batch [400] Speed: 16757.2882328335 samples/sec Train-accuracy=0.999425
-## Batch [500] Speed: 17116.6529207406 samples/sec Train-accuracy=0.99946
-## Batch [600] Speed: 19627.589505195 samples/sec Train-accuracy=0.99945
-## [9] Train-accuracy=0.9991
-## Batch [100] Speed: 18971.5745536982 samples/sec Train-accuracy=0.9992
-## Batch [200] Speed: 15554.8822435383 samples/sec Train-accuracy=0.99955
-## Batch [300] Speed: 18327.6950115053 samples/sec Train-accuracy=0.9997
-## Batch [400] Speed: 17103.0705411788 samples/sec Train-accuracy=0.9997
-## Batch [500] Speed: 15104.8656902394 samples/sec Train-accuracy=0.99974
-## Batch [600] Speed: 13818.7899518255 samples/sec Train-accuracy=0.99975
-## [10] Train-accuracy=0.99975
-```
-
-## Conclusion
-
-
-We have shown how to create a custom CSV Iterator by extending the class `mx.io.CSVIter`. In our class, we iteratively read from a CSV file a batch of data that will be transformed and then processed in the stochastic gradient descent optimization. That way, we are able to manage CSV files that are bigger than the memory of the machine we are using.
-
-Based on this custom iterator, we can also create data loaders that internally transform or expand the data, allowing us to manage files of any size.
-
-
-<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/R-package/vignettes/CustomLossFunction.Rmd b/R-package/vignettes/CustomLossFunction.Rmd
deleted file mode 100644
index 85e882567f8e..000000000000
--- a/R-package/vignettes/CustomLossFunction.Rmd
+++ /dev/null
@@ -1,151 +0,0 @@
-# Customized loss function
-
-This tutorial provides guidelines for using customized loss function in network construction.
-
-## Model Training Example
-
-Let's begin with a small regression example. We can build and train a regression model with the following code:
-
-```{r}
-data(BostonHousing, package = "mlbench")
-BostonHousing[, sapply(BostonHousing, is.factor)] <-
-  as.numeric(as.character(BostonHousing[, sapply(BostonHousing, is.factor)]))
-BostonHousing <- data.frame(scale(BostonHousing))
-
-# Hold out every fifth row for testing; column 14 is used as the target.
-test.ind <- seq(1, 506, 5)    # 1 pt in 5 used for testing
-train.x <- data.matrix(BostonHousing[-test.ind, -14])
-train.y <- BostonHousing[-test.ind, 14]
-# Positive indices select exactly the held-out rows.
-test.x <- data.matrix(BostonHousing[test.ind, -14])
-test.y <- BostonHousing[test.ind, 14]
-
-require(mxnet)
-
-data <- mx.symbol.Variable("data")
-label <- mx.symbol.Variable("label")
-fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1")
-tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1")
-fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2")
-lro <- mx.symbol.LinearRegressionOutput(fc2, name = "lro")
-
-mx.set.seed(0)
-model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y,
-                                     ctx = mx.cpu(),
-                                     num.round = 5,
-                                     array.batch.size = 60,
-                                     optimizer = "rmsprop",
-                                     verbose = TRUE,
-                                     array.layout = "rowmajor",
-                                     batch.end.callback = NULL,
-                                     epoch.end.callback = NULL)
-
-pred <- predict(model, test.x)
-sum((test.y - pred[1,])^2) / length(test.y)
-```
-
-Besides the `LinearRegressionOutput`, we also provide `LogisticRegressionOutput` and `MAERegressionOutput`.
-However, this might not be enough for real-world models. You can provide your own loss function
-by using `mx.symbol.MakeLoss` when constructing the network.
-
-## How to Use Your Own Loss Function
-
-We still use our previous example, but this time we use `mx.symbol.MakeLoss` to minimize the `(pred-label)^2`
-
-```{r}
-data <- mx.symbol.Variable("data")
-label <- mx.symbol.Variable("label")
-fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1")
-tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1")
-fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2")
-lro2 <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fc2, shape = 0) - label), name="lro2")
-```
-
-Then we can train the network just as usual.
-
-```{r}
-mx.set.seed(0)
-model2 <- mx.model.FeedForward.create(lro2, X = train.x, y = train.y,
-                                      ctx = mx.cpu(),
-                                      num.round = 5,
-                                      array.batch.size = 60,
-                                      optimizer = "rmsprop",
-                                      verbose = TRUE,
-                                      array.layout = "rowmajor",
-                                      batch.end.callback = NULL,
-                                      epoch.end.callback = NULL)
-```
-
-We should get very similar results because we are actually minimizing the same loss function.
-However, the result is quite different.
-
-```{r}
-pred2 <- predict(model2, test.x)
-sum((test.y - pred2)^2) / length(test.y)
-```
-
-This is because the output of `mx.symbol.MakeLoss` is the gradient of the loss with respect to the input data.
-We can get the real prediction as below.
-
-```{r}
-internals = internals(model2$symbol)
-fc_symbol = internals[[match("fc2_output", outputs(internals))]]
-
-model3 <- list(symbol = fc_symbol,
-               arg.params = model2$arg.params,
-               aux.params = model2$aux.params)
-
-class(model3) <- "MXFeedForwardModel"
-
-pred3 <- predict(model3, test.x)
-sum((test.y - pred3[1,])^2) / length(test.y)
-```
-
-We have provided many operations on the symbols. An example of `|pred-label|` can be found below.
-
-```{r}
-lro_abs <- mx.symbol.MakeLoss(mx.symbol.abs(mx.symbol.Reshape(fc2, shape = 0) - label))
-mx.set.seed(0)
-model4 <- mx.model.FeedForward.create(lro_abs, X = train.x, y = train.y,
-                                      ctx = mx.cpu(),
-                                      num.round = 20,
-                                      array.batch.size = 60,
-                                      optimizer = "sgd",
-                                      learning.rate = 0.001,
-                                      verbose = TRUE,
-                                      array.layout = "rowmajor",
-                                      batch.end.callback = NULL,
-                                      epoch.end.callback = NULL)
-
-internals = internals(model4$symbol)
-fc_symbol = internals[[match("fc2_output", outputs(internals))]]
-
-model5 <- list(symbol = fc_symbol,
-               arg.params = model4$arg.params,
-               aux.params = model4$aux.params)
-
-class(model5) <- "MXFeedForwardModel"
-
-pred5 <- predict(model5, test.x)
-sum(abs(test.y - pred5[1,])) / length(test.y)
-```
-
-
-```{r}
-lro_mae <- mx.symbol.MAERegressionOutput(fc2, name = "lro")
-mx.set.seed(0)
-model6 <- mx.model.FeedForward.create(lro_mae, X = train.x, y = train.y,
-                                      ctx = mx.cpu(),
-                                      num.round = 20,
-                                      array.batch.size = 60,
-                                      optimizer = "sgd",
-                                      learning.rate = 0.001,
-                                      verbose = TRUE,
-                                      array.layout = "rowmajor",
-                                      batch.end.callback = NULL,
-                                      epoch.end.callback = NULL)
-pred6 <- predict(model6, test.x)
-sum(abs(test.y - pred6[1,])) / length(test.y)
-```
-
-We got the same result as expected.
-
-<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd b/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
deleted file mode 100644
index fb023bb5435f..000000000000
--- a/R-package/vignettes/fiveMinutesNeuralNetwork.Rmd
+++ /dev/null
@@ -1,173 +0,0 @@
-# Neural Network with MXNet in Five Minutes
-
-This is the first tutorial for new users of the R package `mxnet`. You will learn to construct a neural network to do regression in 5 minutes.
-
-We will show you how to do classification and regression tasks respectively. The data we use comes from the package `mlbench`.
-
-## Classification
-
-First of all, let us load in the data and preprocess it:
-
-```{r}
-require(mlbench)
-require(mxnet)
-
-data(Sonar, package = "mlbench")
-
-Sonar[,61] <- as.numeric(Sonar[,61])-1
-train.ind <- c(1:50, 100:150)
-train.x <- data.matrix(Sonar[train.ind, 1:60])
-train.y <- Sonar[train.ind, 61]
-test.x <- data.matrix(Sonar[-train.ind, 1:60])
-test.y <- Sonar[-train.ind, 61]
-```
-
-Next we are going to use a multi-layer perceptron (MLP) as our classifier.
-In `mxnet`, we have a function called `mx.mlp` so that users can build a general multi-layer neural network to do classification (`out_activation="softmax"`) or regression (`out_activation="rmse"`).
-Note for the `softmax` activation, the output is zero-indexed not one-indexed. In the data we use:
-
-```{r}
-table(train.y)
-table(test.y)
-```
-
-There are several parameters we have to feed to `mx.mlp`:
-
-- Training data and label.
-- Number of hidden nodes in each hidden layer.
-- Number of nodes in the output layer.
-- Type of the activation.
-- Type of the output loss.
-- The device to train on: `mx.gpu()` for GPU or `mx.cpu()` for CPU.
-- Other parameters for `mx.model.FeedForward.create`.
-
-The following code piece is showing a possible usage of `mx.mlp`:
-
-```{r}
-mx.set.seed(0)
-model <- mx.mlp(train.x, train.y, hidden_node=10, out_node=2, out_activation="softmax",
-                num.round=20, array.batch.size=15, learning.rate=0.07, momentum=0.9, 
-                eval.metric=mx.metric.accuracy)
-```
-
-Note that `mx.set.seed` is the correct function to control the random process in `mxnet`. You can see the accuracy in each round during training. It is also easy to make prediction and evaluate.
-
-To get an idea of what is happening, we can easily view the computation graph from R.
-
-```{r}
-graph.viz(model$symbol)
-```
-
-```{r}
-preds <- predict(model, test.x)
-pred.label <- max.col(t(preds)) - 1
-table(pred.label, test.y)
-```
-
-Note that for multi-class prediction, mxnet outputs an `nclass` x `nexamples` matrix, with each row corresponding to the probability of that class.
-
-## Regression
-
-Again, let us preprocess the data first.
-
-```{r}
-data(BostonHousing, package="mlbench")
-
-train.ind <- seq(1, 506, 3)
-train.x <- data.matrix(BostonHousing[train.ind, -14])
-train.y <- BostonHousing[train.ind, 14]
-test.x <- data.matrix(BostonHousing[-train.ind, -14])
-test.y <- BostonHousing[-train.ind, 14]
-```
-
-Although we can use `mx.mlp` again to do regression by changing the `out_activation`, this time we are going to introduce a flexible way to configure neural networks in `mxnet`. The configuration is done by the "Symbol" system in `mxnet`, which takes care of the links among nodes, the activation, dropout ratio, etc. To configure a multi-layer neural network, we can do it in the following way:
-
-```{r}
-# Define the input data
-data <- mx.symbol.Variable("data")
-# A fully connected hidden layer
-# data: input source
-# num_hidden: number of neurons in this hidden layer
-fc1 <- mx.symbol.FullyConnected(data, num_hidden=1)
-
-# Use linear regression for the output layer
-lro <- mx.symbol.LinearRegressionOutput(fc1)
-```
-
-What matters for a regression task is mainly the last function, this enables the new network to optimize for squared loss. We can now train on this simple data set. In this configuration, we dropped the hidden layer so the input layer is directly connected to the output layer.
-
-Next we can train a model with this structure and other parameters using `mx.model.FeedForward.create`:
-
-```{r}
-mx.set.seed(0)
-model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
-                                     ctx=mx.cpu(), num.round=50, array.batch.size=20,
-                                     learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse)
-```
-
-It is also easy to make prediction and evaluate
-
-```{r}
-preds <- predict(model, test.x)
-sqrt(mean((preds-test.y)^2))
-```
-
-Currently we have four pre-defined metrics "accuracy", "rmse", "mae" and "rmsle". One might wonder how to customize the evaluation metric. `mxnet` provides the interface for users to define their own metric of interests:
-
-```{r}
-demo.metric.mae <- mx.metric.custom("mae", function(label, pred) {
-  # Mean of the absolute differences between labels and predictions.
-  mean(abs(label - pred))
-})
-```
-
-This is an example for mean absolute error. We can simply plug it in the training function:
-
-```{r}
-mx.set.seed(0)
-model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
-                                     ctx=mx.cpu(), num.round=50, array.batch.size=20,
-                                     learning.rate=2e-6, momentum=0.9, eval.metric=demo.metric.mae)
-```
-
-In the previous example, our target is to predict the last column ("medv") in the dataset.
-It is also possible to build a regression model with multiple outputs.
-This time we use the last two columns as the targets:
-
-```{r}
-train.x <- data.matrix(BostonHousing[train.ind, -(13:14)])
-train.y <- BostonHousing[train.ind, c(13:14)]
-test.x <- data.matrix(BostonHousing[-train.ind, -(13:14)])
-test.y <- BostonHousing[-train.ind, c(13:14)]
-```
-
-and build a similar network symbol:
-
-```{r}
-data <- mx.symbol.Variable("data")
-fc2 <- mx.symbol.FullyConnected(data, num_hidden=2)
-lro2 <- mx.symbol.LinearRegressionOutput(fc2)
-```
-
-We use `mx.io.arrayiter` to build an iter for our training set and train the model using `mx.model.FeedForward.create`:
-
-```{r}
-mx.set.seed(0)
-train_iter = mx.io.arrayiter(data = t(train.x), label = t(train.y))
-
-model <- mx.model.FeedForward.create(lro2, X=train_iter,
-                                     ctx=mx.cpu(), num.round=50, array.batch.size=20,
-                                     learning.rate=2e-6, momentum=0.9)
-```
-
-After training, we can see that the dimension of the prediction is the same as that of our target.
-
-```{r}
-preds <- t(predict(model, test.x))
-dim(preds)
-dim(test.y)
-```
-Congratulations! Now you have learned the basics of using `mxnet`. Please check the other tutorials for advanced features.
-
-
-<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/R-package/vignettes/mnistCompetition.Rmd b/R-package/vignettes/mnistCompetition.Rmd
deleted file mode 100644
index 055f1ae51d7e..000000000000
--- a/R-package/vignettes/mnistCompetition.Rmd
+++ /dev/null
@@ -1,246 +0,0 @@
-# Handwritten Digits Classification Competition
-
-[MNIST](http://yann.lecun.com/exdb/mnist/) is a handwritten digits image data set created by Yann LeCun. Every digit is represented by a 28x28 image. It has become a standard data set to test classifiers on simple image input. Neural network is no doubt a strong model for image classification tasks. There's a [long-term hosted competition](https://www.kaggle.com/c/digit-recognizer) on Kaggle using this data set.
-We will present the basic usage of [mxnet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge.
-
-## Data Loading
-
-First, let us download the data from [here](https://www.kaggle.com/c/digit-recognizer/data), and put them under the `data/` folder in your working directory.
-
-Then we can read them in R and convert to matrices.
-
-```{r, echo=FALSE}
-download.file('https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/mnist_csv.zip', destfile = 'mnist_csv.zip')
-unzip('mnist_csv.zip', exdir = '.')
-```
-
-
-```{r}
-require(mxnet)
-train <- read.csv("train.csv", header=TRUE)
-test <- read.csv("test.csv", header=TRUE)
-train <- data.matrix(train)
-test <- data.matrix(test)
-
-train.x <- train[,-1]
-train.y <- train[,1]
-```
-
-Besides using the csv files from kaggle, you can also read the original MNIST dataset into R.
-
-```{r, eval=FALSE}
-load_image_file <- function(filename) {
-  f = file(filename, 'rb')
-  readBin(f, 'integer', n = 1, size = 4, endian = 'big')
-  n = readBin(f,'integer', n = 1, size = 4, endian = 'big')
-  nrow = readBin(f,'integer', n = 1, size = 4, endian = 'big')
-  ncol = readBin(f,'integer', n = 1, size = 4, endian = 'big')
-  x = readBin(f, 'integer', n = n * nrow * ncol, size = 1, signed = F)
-  x = matrix(x, ncol = nrow * ncol, byrow = T)
-  close(f)
-  x
-}
-
-load_label_file <- function(filename) {
-  f = file(filename, 'rb')
-  readBin(f,'integer', n = 1, size = 4, endian = 'big')
-  n = readBin(f,'integer', n = 1, size = 4, endian = 'big')
-  y = readBin(f,'integer', n = n, size = 1, signed = F)
-  close(f)
-  y
-}
-
-train.x <- load_image_file('mnist/train-images-idx3-ubyte')
-test.y <- load_image_file('mnist/t10k-images-idx3-ubyte')
-
-train.y <- load_label_file('mnist/train-labels-idx1-ubyte')
-test.y <- load_label_file('mnist/t10k-labels-idx1-ubyte')
-```
-
-Here every image is represented as a single row in train/test. The greyscale values of each image fall in the range [0, 255]; we can linearly transform them into [0, 1] by
-
-```{r}
-train.x <- t(train.x/255)
-test <- t(test/255)
-```
-We also transpose the input matrix to npixel x nexamples, which is the column major format accepted by mxnet (and the convention of R).
-
-In the label part, we see the number of each digit is fairly even:
-
-```{r}
-table(train.y)
-```
-
-## Network Configuration
-
-Now we have the data. The next step is to configure the structure of our network.
-
-```{r}
-data <- mx.symbol.Variable("data")
-fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)
-act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")
-fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=64)
-act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu")
-fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10)
-softmax <- mx.symbol.SoftmaxOutput(fc3, name="sm")
-```
-
-1. In `mxnet`, we use its own data type `symbol` to configure the network. `data <- mx.symbol.Variable("data")` uses `data` to represent the input data, i.e. the input layer.
-2. Then we set the first hidden layer by `fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)`. This layer has `data` as the input, its name and the number of hidden neurons.
-3. The activation is set by `act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")`. The activation function takes the output from the first hidden layer `fc1`.
-4. The second hidden layer takes the result from `act1` as the input, with its name as "fc2" and the number of hidden neurons as 64.
-5. The second activation is almost the same as `act1`, except we have a different input source and name.
-6. Here comes the output layer. Since there are only 10 digits, we set the number of neurons to 10.
-7. Finally we set the activation to softmax to get a probabilistic prediction.
-
-If you are a big fan of the `%>%` operator, you can also define the network as below:
-
-```{r, eval=FALSE}
-library(magrittr)
-softmax <- mx.symbol.Variable("data") %>%
-  mx.symbol.FullyConnected(name = "fc1", num_hidden = 128) %>%
-  mx.symbol.Activation(name = "relu1", act_type = "relu") %>%
-  mx.symbol.FullyConnected(name = "fc2", num_hidden = 64) %>%
-  mx.symbol.Activation(name = "relu2", act_type = "relu") %>%
-  mx.symbol.FullyConnected(name="fc3", num_hidden=10) %>%
-  mx.symbol.SoftmaxOutput(name="sm")
-```
-
-## Training
-
-We are almost ready for the training process. Before we start the computation, let's decide which device we should use.
-
-```{r}
-devices <- mx.cpu()
-```
-
-Here we assign CPU to `mxnet`. After all this preparation, you can run the following command to train the neural network! Note that `mx.set.seed` is the correct function to control the random process in `mxnet`.
-
-```{r}
-mx.set.seed(0)
-model <- mx.model.FeedForward.create(softmax, X = train.x, y = train.y,
-                                     ctx = devices, num.round = 5,
-                                     array.batch.size = 100,
-                                     learning.rate = 0.07, momentum = 0.9,
-                                     eval.metric = mx.metric.accuracy,
-                                     initializer = mx.init.uniform(0.07),
-                                     batch.end.callback = mx.callback.log.train.metric(100))
-```
-
-## Prediction and Submission
-
-To make prediction, we can simply write
-
-```{r}
-preds <- predict(model, test)
-dim(preds)
-```
-
-It is a matrix with 10 rows and 28000 columns, containing the desired classification probabilities from the output layer, one column per test image. To extract the most probable label for each image, we can transpose it and use `max.col` in R:
-
-```{r}
-pred.label <- max.col(t(preds)) - 1
-table(pred.label)
-```
-
-With a little extra effort in the csv format, we can have our submission to the competition!
-
-```{r, eval = FALSE}
-submission <- data.frame(ImageId=1:ncol(test), Label=pred.label)
-write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE)
-```
-
-## LeNet
-
-Next we are going to introduce a new network structure: [LeNet](http://yann.lecun.com/exdb/lenet/). It is proposed by Yann LeCun to recognize handwritten digits. Now we are going to demonstrate how to construct and train an LeNet in `mxnet`.
-
-
-First we construct the network:
-
-```{r}
-require(mxnet)
-# input
-data <- mx.symbol.Variable('data')
-# first conv
-conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=20)
-tanh1 <- mx.symbol.Activation(data=conv1, act_type="tanh")
-pool1 <- mx.symbol.Pooling(data=tanh1, pool_type="max",
-                          kernel=c(2,2), stride=c(2,2))
-# second conv
-conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=50)
-tanh2 <- mx.symbol.Activation(data=conv2, act_type="tanh")
-pool2 <- mx.symbol.Pooling(data=tanh2, pool_type="max",
-                          kernel=c(2,2), stride=c(2,2))
-# first fullc
-flatten <- mx.symbol.Flatten(data=pool2)
-fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=500)
-tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh")
-# second fullc
-fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10)
-# loss
-lenet <- mx.symbol.SoftmaxOutput(data=fc2)
-```
-
-Then let us reshape the matrices into arrays:
-
-```{r}
-train.array <- train.x
-dim(train.array) <- c(28, 28, 1, ncol(train.x))
-test.array <- test
-dim(test.array) <- c(28, 28, 1, ncol(test))
-```
-
-Next we are going to compare the training speed on different devices, so the definition of the devices goes first:
-
-```{r}
-n.gpu <- 1
-device.cpu <- mx.cpu()
-device.gpu <- lapply(0:(n.gpu-1), function(i) {
-  mx.gpu(i)
-})
-```
-
-As you can see, we can pass a list of devices to ask mxnet to train on multiple GPUs (you can do a similar thing for CPUs,
-but since the internal computation on a CPU is already multi-threaded, there is less gain than with GPUs).
-
-We start by training on CPU first. Because it takes a bit of time to do so, we will only run it for one iteration.
-
-```{r}
-mx.set.seed(0)
-tic <- proc.time()
-model <- mx.model.FeedForward.create(lenet, X = train.array, y = train.y,
-                                     ctx = device.cpu, num.round = 1,
-                                     array.batch.size = 100,
-                                     learning.rate = 0.05, momentum = 0.9, wd = 0.00001,
-                                     eval.metric = mx.metric.accuracy,
-                                     batch.end.callback = mx.callback.log.train.metric(100))
-print(proc.time() - tic)
-```
-
-Training on GPU:
-
-```{r}
-mx.set.seed(0)
-tic <- proc.time()
-model <- mx.model.FeedForward.create(lenet, X = train.array, y = train.y,
-                                     ctx = device.gpu, num.round = 5,
-                                     array.batch.size = 100,
-                                     learning.rate = 0.05, momentum = 0.9, wd = 0.00001,
-                                     eval.metric = mx.metric.accuracy,
-                                     batch.end.callback = mx.callback.log.train.metric(100))
-print(proc.time() - tic)
-```
-
-As you can see, by using a GPU we can get a significant speedup in training!
-Finally we can submit the result to Kaggle again to see the improvement of our ranking!
-
-```{r, eval = FALSE}
-preds <- predict(model, test.array)
-pred.label <- max.col(t(preds)) - 1
-submission <- data.frame(ImageId=1:ncol(test), Label=pred.label)
-write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE)
-```
-
-![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/knitr/mnistCompetition-kaggle-submission.png)
-
-<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
diff --git a/R-package/vignettes/symbol.Rmd b/R-package/vignettes/symbol.Rmd
deleted file mode 100644
index 228c6b26606c..000000000000
--- a/R-package/vignettes/symbol.Rmd
+++ /dev/null
@@ -1,103 +0,0 @@
-# Symbol and Automatic Differentiation
-
-The computational unit `NDArray` requires a way to construct neural networks. MXNet provides a symbolic interface, named Symbol, to do this. Symbol combines both flexibility and efficiency.
-
-## Basic Composition of Symbols
-
-The following code creates a two-layer perceptron network:
-
-
-```{r}
-require(mxnet)
-net <- mx.symbol.Variable("data")
-net <- mx.symbol.FullyConnected(data=net, name="fc1", num_hidden=128)
-net <- mx.symbol.Activation(data=net, name="relu1", act_type="relu")
-net <- mx.symbol.FullyConnected(data=net, name="fc2", num_hidden=64)
-net <- mx.symbol.Softmax(data=net, name="out")
-class(net)
-```
-
-
-Each symbol takes a (unique) string name. *Variable* often defines the inputs,
-or free variables. Other symbols take a symbol as the input (*data*),
-and may accept other hyper parameters, such as the number of hidden neurons (*num_hidden*)
-or the activation type (*act_type*).
-
-A symbol can be viewed as a function that takes several arguments, whose
-names are automatically generated and can be retrieved with the following command:
-
-
-```{r}
-arguments(net)
-```
-
-The arguments are the parameters needed by each symbol:
-
-- *data*: Input data needed by the variable *data*
-- *fc1_weight* and *fc1_bias*: The weight and bias for the first fully connected layer, *fc1*
-- *fc2_weight* and *fc2_bias*: The weight and bias for the second fully connected layer, *fc2*
-- *out_label*: The label needed by the loss
-
-We can also specify the automatically generated names explicitly:
-
-
-```{r}
-data <- mx.symbol.Variable("data")
-w <- mx.symbol.Variable("myweight")
-net <- mx.symbol.FullyConnected(data=data, weight=w, name="fc1", num_hidden=128)
-arguments(net)
-```
-
-## More Complicated Composition of Symbols
-
-MXNet provides well-optimized symbols for
-commonly used layers in deep learning. You can also define new operators
-in Python. The following example first performs an element-wise add between two
-symbols, then feeds them to the fully connected operator:
-
-
-```{r}
-lhs <- mx.symbol.Variable("data1")
-rhs <- mx.symbol.Variable("data2")
-net <- mx.symbol.FullyConnected(data=lhs + rhs, name="fc1", num_hidden=128)
-arguments(net)
-```
-
-We can construct a symbol more flexibly than by using the single
-forward composition, for example:
-
-```{r}
-net <- mx.symbol.Variable("data")
-net <- mx.symbol.FullyConnected(data=net, name="fc1", num_hidden=128)
-net2 <- mx.symbol.Variable("data2")
-net2 <- mx.symbol.FullyConnected(data=net2, name="net2", num_hidden=128)
-composed.net <- mx.apply(net, data=net2, name="compose")
-arguments(composed.net)
-```
-
-In the example, *net* is used as a function that is applied to an existing symbol
-*net2*. The resulting *composed.net* replaces the original argument *data* of *net* with
-*net2*.
-
-## Training a Neural Net
-
-The [model API](../../../R-package/R/model.R) is a thin wrapper around the symbolic executors to support neural net training.
-
-We encourage you to read [Symbolic Configuration and Execution in Pictures for python package](../python/symbol_in_pictures.md) for a detailed explanation of concepts in pictures.
-
-## How Efficient Is the Symbolic API?
-
-The Symbolic API brings the efficient C++
-operations in powerful toolkits, such as CXXNet and Caffe, together with the
-flexible dynamic NDArray operations. All of the memory and computation resources are
-allocated statically during bind operations, to maximize runtime performance and memory
-utilization.
-
-The coarse-grained operators are equivalent to CXXNet layers, which are
-extremely efficient.  We also provide fine-grained operators for more flexible
-composition. Because MXNet does more in-place memory allocation, it can
-be more memory efficient than CXXNet and gets to the same runtime with
-greater flexibility.
-
-
-<!-- INSERT SOURCE DOWNLOAD BUTTONS -->
\ No newline at end of file
diff --git a/amalgamation/mxnet_predict0.cc b/amalgamation/mxnet_predict0.cc
index a18e28fd57de..f9bf45adfa19 100644
--- a/amalgamation/mxnet_predict0.cc
+++ b/amalgamation/mxnet_predict0.cc
@@ -81,7 +81,6 @@
 #include "src/operator/leaky_relu.cc"
 #include "src/operator/nn/pooling.cc"
 #include "src/operator/nn/softmax_activation.cc"
-#include "src/operator/softmax_output.cc"
 #include "src/operator/tensor/elemwise_binary_broadcast_op_basic.cc"
 #include "src/operator/tensor/elemwise_binary_op.cc"
 #include "src/operator/tensor/elemwise_binary_op_basic.cc"
diff --git a/benchmark/opperf/nd_operations/nn_basic_operators.py b/benchmark/opperf/nd_operations/nn_basic_operators.py
index f3007bac188c..d669b83d653e 100644
--- a/benchmark/opperf/nd_operations/nn_basic_operators.py
+++ b/benchmark/opperf/nd_operations/nn_basic_operators.py
@@ -29,11 +29,6 @@
 1. FullyConnected
 2. Dropout
 3. BatchNorm
-4. SoftmaxOutput
-5. LinearRegressionOutput
-6. LogisticRegressionOutput
-7. MAERegressionOutput
-8. SVMOutput
 9. L2Normalization
 10. LayerNorm
 11. InstanceNorm
diff --git a/benchmark/opperf/rules/default_params.py b/benchmark/opperf/rules/default_params.py
index 04ec8100b1b4..4e8bb6b6cc6f 100644
--- a/benchmark/opperf/rules/default_params.py
+++ b/benchmark/opperf/rules/default_params.py
@@ -182,14 +182,7 @@
 # L2Normalization
 DEFAULT_MODE_L2 = ['channel', 'instance', 'spatial']
 
-# SVMOutput
-DEFAULT_LABEL_SVM = [(32, 3, 256), (32, 3, 10000)]
-
 DEFAULT_DATA_SVM_LARGE_TENSOR = [(2**29, 2, 2, 2)]
-DEFAULT_LABEL_SVM_LARGE_TENSOR = [(2**29, 2, 2)]
-
-# SoftmaxOutput
-DEFAULT_LABEL_SM = [(32, 3, 256), (32, 3, 10000)]
 
 DEFAULT_DATA_SO_LARGE_TENSOR = [(2**29, 2, 2, 2)]
 DEFAULT_LABEL_SO_LARGE_TENSOR = [(2**29, 2, 2)]
@@ -537,16 +530,6 @@
                    "moving_mean_batchnorm": DEFAULT_MOVING_MEAN,
                    "moving_var_batchnorm": DEFAULT_MOVING_VAR,
                    "axis_batchnorm": DEFAULT_AXIS_BN,
-                   "data_softmaxoutput": DEFAULT_DATA_NN_BASIC,
-                   "label_softmaxoutput": DEFAULT_LABEL_SM,
-                   "data_maeregressionoutput": DEFAULT_DATA_NN_BASIC,
-                   "label_maeregressionoutput": DEFAULT_LABEL_REG,
-                   "data_logisticregressionoutput": DEFAULT_DATA_NN_BASIC,
-                   "label_logisticregressionoutput": DEFAULT_LABEL_REG,
-                   "data_linearregressionoutput": DEFAULT_DATA_NN_BASIC,
-                   "label_linearregressionoutput": DEFAULT_LABEL_REG,
-                   "data_svmoutput": DEFAULT_DATA_NN_BASIC,
-                   "label_svmoutput": DEFAULT_LABEL_SVM,
                    "grad_scale": DEFAULT_GRAD_SCALE,
                    "normalization": DEFAULT_NORMALIZATION,
                    "margin": DEFAULT_MARGIN,
@@ -751,16 +734,6 @@
                                 "moving_mean_batchnorm": DEFAULT_MOVING_MEAN_LARGE_TENSOR,
                                 "moving_var_batchnorm": DEFAULT_MOVING_VAR_LARGE_TENSOR,
                                 "axis_batchnorm": DEFAULT_AXIS_BN,
-                                "data_softmaxoutput": DEFAULT_DATA_SO_LARGE_TENSOR,
-                                "label_softmaxoutput": DEFAULT_LABEL_SO_LARGE_TENSOR,
-                                "data_maeregressionoutput": DEFAULT_DATA_REG_LARGE_TENSOR,
-                                "label_maeregressionoutput": DEFAULT_LABEL_REG_LARGE_TENSOR,
-                                "data_logisticregressionoutput": DEFAULT_DATA_REG_LARGE_TENSOR,
-                                "label_logisticregressionoutput": DEFAULT_LABEL_REG_LARGE_TENSOR,
-                                "data_linearregressionoutput": DEFAULT_DATA_REG_LARGE_TENSOR,
-                                "label_linearregressionoutput": DEFAULT_LABEL_REG_LARGE_TENSOR,
-                                "data_svmoutput": DEFAULT_DATA_SVM_LARGE_TENSOR,
-                                "label_svmoutput": DEFAULT_LABEL_SVM_LARGE_TENSOR,
                                 "grad_scale": DEFAULT_GRAD_SCALE,
                                 "normalization": DEFAULT_NORMALIZATION,
                                 "margin": DEFAULT_MARGIN,
diff --git a/benchmark/opperf/utils/benchmark_utils.py b/benchmark/opperf/utils/benchmark_utils.py
index f2cce0abec09..b3bf8213569a 100644
--- a/benchmark/opperf/utils/benchmark_utils.py
+++ b/benchmark/opperf/utils/benchmark_utils.py
@@ -182,10 +182,6 @@ def run_performance_test(ops, inputs, run_backward=True,
 
 
 def run_op_benchmarks(ops, dtype, ctx, profiler, int64_tensor, warmup, runs):
-    # Running SoftmaxOutput backwards on GPU results in errors
-    # track issue here: https://github.com/apache/incubator-mxnet/issues/880
-    gpu_backwards_disabled_ops = ['SoftmaxOutput']
-
     # Running im2col either forwards or backwards on GPU results in errors
     # track issue here: https://github.com/apache/incubator-mxnet/issues/17493
     gpu_disabled_ops = ['im2col']
@@ -198,7 +194,7 @@ def run_op_benchmarks(ops, dtype, ctx, profiler, int64_tensor, warmup, runs):
             inputs = prepare_op_inputs(op, op_params, int64_tensor)
 
             # setting backward false for ops with known issue
-            if (ctx == mx.gpu() and op in gpu_backwards_disabled_ops) or op in no_backward:
+            if op in no_backward:
                 op_params["has_backward"] = False
 
             # Run benchmarks
diff --git a/benchmark/opperf/utils/op_registry_utils.py b/benchmark/opperf/utils/op_registry_utils.py
index 65eb6aab2aac..d3cf1a418334 100644
--- a/benchmark/opperf/utils/op_registry_utils.py
+++ b/benchmark/opperf/utils/op_registry_utils.py
@@ -121,8 +121,8 @@ def prepare_op_inputs(op, arg_params, int64_tensor):
     # For ops with args that need to change shape/value for different ops
     custom_data = {'Activation', 'LeakyReLU', 'Softmax', 'BilinearSampler', 'GridGenerator', 'sample_multinomial', 'linalg_maketrian',
                    'SpatialTransformer', 'col2im', 'GroupNorm', 'Dropout', 'FullyConnected',
-                   'SoftmaxOutput', 'LinearRegressionOutput', 'BatchNorm', 'LogisticRegressionOutput',
-                   'MAERegressionOutput', 'SVMOutput', 'L2Normalization', 'LayerNorm', 'InstanceNorm',
+                   'BatchNorm',
+                   'L2Normalization', 'LayerNorm', 'InstanceNorm',
                    'Embedding', 'Correlation', 'im2col', 'LRN', 'squeeze', 'fill_element_0index'}
 
     custom_data_int64 = {'random_pdf_dirichlet', 'random_pdf_exponential', 'random_pdf_gamma',
@@ -366,8 +366,8 @@ def get_all_nn_basic_operators():
     -------
     {"operator_name": {"has_backward", "nd_op_handle", "params"}}
     """
-    nn_basic_ops = ['FullyConnected', 'Dropout', 'BatchNorm', 'SoftmaxOutput', 'LinearRegressionOutput',
-                    'LogisticRegressionOutput', 'MAERegressionOutput', 'SVMOutput', 'L2Normalization',
+    nn_basic_ops = ['FullyConnected', 'Dropout', 'BatchNorm',
+                    'L2Normalization',
                     'LayerNorm', 'InstanceNorm', 'Embedding', 'Correlation', 'SpatialTransformer', 'im2col',
                     'col2im', 'GroupNorm', 'LRN']
 
diff --git a/ci/docker/runtime_functions.sh b/ci/docker/runtime_functions.sh
index 62412e092523..6b14a386fa09 100755
--- a/ci/docker/runtime_functions.sh
+++ b/ci/docker/runtime_functions.sh
@@ -491,10 +491,6 @@ build_ubuntu_cpu_cmake_asan() {
         -DMXNET_USE_CPU=ON \
         /work/mxnet
     make -j $(nproc) mxnet
-    # Disable leak detection but enable ASAN to link with ASAN but not fail with build tooling.
-    ASAN_OPTIONS=detect_leaks=0 \
-    LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libasan.so.5 \
-    make -j $(nproc) mlp_cpu
 }
 
 build_ubuntu_cpu_gcc8_werror() {
@@ -1170,21 +1166,6 @@ unittest_ubuntu_minimal_R() {
         R_LIBS=/tmp/r-site-library
 
     R CMD INSTALL --library=/tmp/r-site-library R-package
-    # pick mlp as minimal R test
-    R_LIBS=/tmp/r-site-library \
-        Rscript -e "library(mxnet); require(mlbench); \
-                    data(Sonar, package=\"mlbench\"); \
-                    Sonar[,61] = as.numeric(Sonar[,61])-1; \
-                    train.ind = c(1:50, 100:150); \
-                    train.x = data.matrix(Sonar[train.ind, 1:60]); \
-                    train.y = Sonar[train.ind, 61]; \
-                    test.x = data.matrix(Sonar[-train.ind, 1:60]); \
-                    test.y = Sonar[-train.ind, 61]; \
-                    model = mx.mlp(train.x, train.y, hidden_node = 10, \
-                                   out_node = 2, out_activation = \"softmax\", \
-                                   learning.rate = 0.1, \
-                                   array.layout = \"rowmajor\"); \
-                    preds = predict(model, test.x, array.layout = \"rowmajor\")"
 }
 
 unittest_ubuntu_gpu_R() {
@@ -1271,39 +1252,12 @@ integrationtest_ubuntu_cpu_onnx() {
 	pytest -n 4 tests/python/unittest/onnx/test_node.py
 }
 
-integrationtest_ubuntu_cpu_asan() {
-    set -ex
-    export DMLC_LOG_STACK_TRACE_DEPTH=10
-
-    cd /work/mxnet/build/cpp-package/example/
-    /work/mxnet/cpp-package/example/get_data.sh
-    ./mlp_cpu
-}
-
 integrationtest_ubuntu_gpu_cpp_package() {
     set -ex
     export DMLC_LOG_STACK_TRACE_DEPTH=10
     cpp-package/tests/ci_test.sh
 }
 
-integrationtest_ubuntu_gpu_capi_cpp_package() {
-    set -ex
-    export PYTHONPATH=./python/
-    export LD_LIBRARY_PATH=/work/mxnet/lib:$LD_LIBRARY_PATH
-    python3 -c "import mxnet as mx; mx.test_utils.download_model(\"imagenet1k-resnet-18\"); mx.test_utils.download_model(\"imagenet1k-resnet-152\"); mx.test_utils.download_model(\"imagenet1k-resnet-50\");"
-    # Load symbol, convert symbol to leverage fusion with subgraphs, save the model
-    python3 -c "import mxnet as mx; x = mx.sym.load(\"imagenet1k-resnet-152-symbol.json\"); x.get_backend_symbol(\"MKLDNN\"); x.save(\"imagenet1k-resnet-152-subgraph-symbol.json\");"
-    # Copy params file with a different name, used in subgraph symbol testing
-    cp imagenet1k-resnet-152-0000.params imagenet1k-resnet-152-subgraph-0000.params
-    build/tests/cpp/mxnet_unit_tests --gtest_filter="ThreadSafety.*"
-    build/tests/cpp/mxnet_unit_tests --gtest_filter="ThreadSafety.*" --thread-safety-with-cpu
-    # Also run thread safety tests in NaiveEngine mode
-    export MXNET_ENGINE_TYPE=NaiveEngine
-    build/tests/cpp/mxnet_unit_tests --gtest_filter="ThreadSafety.*"
-    build/tests/cpp/mxnet_unit_tests --gtest_filter="ThreadSafety.*" --thread-safety-with-cpu
-    unset MXNET_ENGINE_TYPE
-}
-
 integrationtest_ubuntu_cpu_dist_kvstore() {
     set -ex
     pushd .
diff --git a/ci/jenkins/Jenkins_steps.groovy b/ci/jenkins/Jenkins_steps.groovy
index ce97f4a03a74..c7c700210d15 100644
--- a/ci/jenkins/Jenkins_steps.groovy
+++ b/ci/jenkins/Jenkins_steps.groovy
@@ -984,20 +984,6 @@ def test_unix_cpp_package_gpu(lib_name) {
     }]
 }
 
-def test_unix_capi_cpp_package(lib_name) {
-    return ['capi-cpp-package GPU Makefile': {
-      node(NODE_LINUX_GPU_G4) {
-        ws('workspace/it-capi-cpp-package') {
-          timeout(time: max_time, unit: 'MINUTES') {
-            utils.unpack_and_init(lib_name, mx_lib_cpp_capi_make)
-            utils.docker_run('ubuntu_gpu_cu101', 'integrationtest_ubuntu_gpu_capi_cpp_package', true)
-            utils.publish_test_coverage()
-          }
-        }
-      }
-    }]
-}
-
 def test_unix_scala_cpu(lib_name) {
     return ['Scala: CPU Makefile': {
       node(NODE_LINUX_CPU) {
@@ -1693,17 +1679,6 @@ def docs_publish_beta() {
 }
 
 
-def misc_asan_cpu(lib_name) {
-    return ['CPU ASAN': {
-      node(NODE_LINUX_CPU) {
-        ws('workspace/ut-python3-cpu-asan') {
-            utils.unpack_and_init(lib_name, mx_lib_cpp_examples_cpu)
-            utils.docker_run('ubuntu_cpu', 'integrationtest_ubuntu_cpu_asan', false)
-        }
-      }
-    }]
-}
-
 def sanity_lint() {
     return ['Lint': {
       node(NODE_LINUX_CPU) {
diff --git a/ci/jenkins/Jenkinsfile_miscellaneous b/ci/jenkins/Jenkinsfile_miscellaneous
index d6376a72ddbf..eb6b632cdbd3 100644
--- a/ci/jenkins/Jenkinsfile_miscellaneous
+++ b/ci/jenkins/Jenkinsfile_miscellaneous
@@ -44,7 +44,6 @@ core_logic: {
   ])
 
   utils.parallel_stage('Tests', [
-    custom_steps.misc_asan_cpu('cpu_asan'),
     custom_steps.misc_test_docker_cache_build()
   ])
 }
diff --git a/ci/jenkins/Jenkinsfile_unix_gpu b/ci/jenkins/Jenkinsfile_unix_gpu
index 33c0d4daf580..d662a95e014f 100644
--- a/ci/jenkins/Jenkinsfile_unix_gpu
+++ b/ci/jenkins/Jenkinsfile_unix_gpu
@@ -60,7 +60,6 @@ core_logic: {
     // TODO(szha): fix and reenable the hanging issue. tracked in #18098
     // custom_steps.test_unix_distributed_kvstore_gpu('gpu'),
     custom_steps.test_unix_byteps_gpu('gpu'),
-    custom_steps.test_unix_capi_cpp_package('gpu_mkldnn_cpp_test_make'),
   ]) 
 }
 ,
diff --git a/ci/windows/test_py3_gpu.ps1 b/ci/windows/test_py3_gpu.ps1
index 9f200f31b540..5dbc6fc28755 100644
--- a/ci/windows/test_py3_gpu.ps1
+++ b/ci/windows/test_py3_gpu.ps1
@@ -35,11 +35,6 @@ if ($LastExitCode -ne 0) { Throw ("Error running parallel tests, python exited w
 C:\Python37\python.exe -m pytest -v -m 'serial' --durations=50 --cov-report xml:tests_operator.xml --cov-append tests\python\gpu\test_operator_gpu.py
 if ($LastExitCode -ne 0) { Throw ("Error running serial tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) }
 
-C:\Python37\python.exe -m pytest -v -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_forward.xml tests\python\gpu\test_forward.py
-if ($LastExitCode -ne 0) { Throw ("Error running parallel tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) }
-C:\Python37\python.exe -m pytest -v -m 'serial' --durations=50 --cov-report xml:tests_forward.xml --cov-append tests\python\gpu\test_forward.py
-if ($LastExitCode -ne 0) { Throw ("Error running serial tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) }
-
 C:\Python37\python.exe -m pytest -v -m 'not serial' -n 4 --durations=50 --cov-report xml:tests_train.xml tests\python\train
 if ($LastExitCode -ne 0) { Throw ("Error running parallel tests, python exited with status code " + ('{0:X}' -f $LastExitCode)) }
 C:\Python37\python.exe -m pytest -v -m 'serial' --durations=50 --cov-report xml:tests_train.xml --cov-append tests\python\train
diff --git a/contrib/clojure-package/examples/bert/.gitignore b/contrib/clojure-package/examples/bert/.gitignore
deleted file mode 100644
index 70c55267e7ab..000000000000
--- a/contrib/clojure-package/examples/bert/.gitignore
+++ /dev/null
@@ -1,18 +0,0 @@
-/target
-/classes
-/checkouts
-pom.xml
-pom.xml.asc
-*.jar
-*.class
-/.lein-*
-/.nrepl-port
-.hgignore
-.hg/
-data/*
-model/*
-*~
-*.params
-*.states
-*.json
-
diff --git a/contrib/clojure-package/examples/bert/README.md b/contrib/clojure-package/examples/bert/README.md
deleted file mode 100644
index 0681a0694fc8..000000000000
--- a/contrib/clojure-package/examples/bert/README.md
+++ /dev/null
@@ -1,163 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-
-# BERT
-
-There are two examples showcasing the power of BERT. One is BERT-QA for inference and the other is BERT Sentence Pair Classification which uses fine tuning of the BERT base model. For more information about BERT please read [http://jalammar.github.io/illustrated-bert/](http://jalammar.github.io/illustrated-bert/).
-
-## bert-qa
-
-**This example was based off of the Java API one. It shows how to do inference with a pre-trained BERT network that is trained on Questions and Answers using the [SQuAD Dataset](https://rajpurkar.github.io/SQuAD-explorer/)**
-
-The pretrained model was created using GluonNLP and then exported to the MXNet symbol format. You can find more information in the background section below.
-
-In this tutorial, we will walk through the BERT QA model trained by MXNet. 
-Users can provide a question with a paragraph contains answer to the model and
-the model will be able to find the best answer from the answer paragraph.
-
-Example:
-
-```
-{:input-answer "Steam engines are external combustion engines, where the working fluid is separate from the combustion products. Non-combustion heat sources such as solar power, nuclear power or geothermal energy may be used. The ideal thermodynamic cycle used to analyze this process is called the Rankine cycle. In the cycle, water is heated and transforms into steam within a boiler operating at a high pressure. When expanded through pistons or turbines, mechanical work is done. The reduced-pressure steam is then condensed and pumped back into the boiler."
-  :input-question "Along with geothermal and nuclear, what is a notable non-combustion heat source?"
-  :ground-truth-answers ["solar"
-                         "solar power"
-                         "solar power, nuclear power or geothermal energy solar"]}
-```
-
-The prediction in this case would be `solar power`
-
-### Setup Guide
-
-Note: If you have trouble with your REPL and cider, please comment out the `lein-jupyter` plugin. There are some conflicts with cider.
-
-#### Step 1: Download the model
-
-For this tutorial, you can get the model and vocabulary by running following bash file. This script will use `wget` to download these artifacts from AWS S3.
-
-From the example directory:
-
-```bash
-./get_bert_data.sh
-```
-
-Some sample questions and answers are provide in the `squad-sample.edn` file. Some are taken directly from the SQuAD dataset and one was just made up. Feel free to edit the file and add your own!
-
-
-### To run
-
-* `lein install` in the root of the main project directory
-* cd into this project directory and do `lein run`. This will execute the cpu version.
-  * `lein run` or `lein run :cpu` to run with cpu
-  * `lein run :gpu` to run with gpu
-
-### Background
-
-To learn more about how BERT works in MXNet, please follow this [MXNet Gluon tutorial on NLP using BERT](https://medium.com/apache-mxnet/gluon-nlp-bert-6a489bdd3340).
-
-The model was extracted from MXNet GluonNLP with static length settings.
-
-[Download link for the script](https://gluon-nlp.mxnet.io/_downloads/bert.zip)
-
-The original description can be found in the [MXNet GluonNLP model zoo](https://gluon-nlp.mxnet.io/model_zoo/bert/index.html#bert-base-on-squad-1-1).
-```bash
-python static_finetune_squad.py --optimizer adam --accumulate 2 --batch_size 6 --lr 3e-5 --epochs 2 --gpu 0 --export
-
-```
-This script will generate `json` and `param` files that are the standard MXNet model files.
-By default, this model are using `bert_12_768_12` model with extra layers for QA jobs.
-
-After that, to be able to use it in Java, we need to export the dictionary from the script to parse the text
-to actual indexes. Please add the following lines after [this line](https://github.com/dmlc/gluon-nlp/blob/master/scripts/bert/staticbert/static_finetune_squad.py#L262).
-```python
-import json
-json_str = vocab.to_json()
-f = open("vocab.json", "w")
-f.write(json_str)
-f.close()
-```
-This would export the token vocabulary in json format.
-Once you have these three files, you will be able to run this example without problems.
-
-## Fine-tuning Sentence Pair Classification with BERT
-
-This was based off of the great tutorial for in Gluon-NLP [https://gluon-nlp.mxnet.io/examples/sentence_embedding/bert.html](https://gluon-nlp.mxnet.io/examples/sentence_embedding/bert.html).
-
-We use the pre-trained BERT model that was exported from GluonNLP via the `scripts/bert/staticbert/static_export_base.py` running `python static_export_base.py --seq_length 128`. For convenience, the model has been downloaded for you by running the get_bert_data.sh file in the root directory of this example.
-
-It will fine tune the base bert model for use in a classification task for 3 epochs.
-
-
-### Setup Guide
-
-
-## Installation
-
-Before you run this example, make sure that you have the clojure package installed.
-In the main clojure package directory, do `lein install`. Then you can run
-`lein install` in this directory.
-
-#### Step 1: Download the model
-
-For this tutorial, you can get the model and vocabulary by running following bash file. This script will use `wget` to download these artifacts from AWS S3.
-
-From the example directory:
-
-```bash
-./get_bert_data.sh
-```
-
-### To run the notebook walkthrough
-
-There is a Jupyter notebook that uses the `lein jupyter` plugin to be able to execute Clojure code in project setting. The first time that you run it you will need to install the kernel with `lein jupyter install-kernel`. After that you can open the notebook in the project directory with `lein jupyter notebook`.
-
-There is also an exported copy of the walkthrough to markdown `fine-tune-bert.md`.
-
-
-### To run
-
-* `lein install` in the root of the main project directory
-* cd into this project directory and do `lein run`. This will execute the cpu version.
-
-`lein run -m bert.bert-sentence-classification :cpu` - to run with cpu
-`lein run -m bert.bert-sentence-classification :gpu` - to run with gpu
-
-By default it will run 3 epochs, you can control the number of epochs with:
-
-`lein run -m bert.bert-sentence-classification :cpu 1` to run just 1 epoch
-
-
-Sample results from cpu run on OSX
-```
-INFO  org.apache.mxnet.module.BaseModule: Epoch[1] Train-accuracy=0.65384614
-INFO  org.apache.mxnet.module.BaseModule: Epoch[1] Time cost=464187
-INFO  org.apache.mxnet.Callback$Speedometer: Epoch[2] Batch [1]	Speed: 0.91 samples/sec	Train-accuracy=0.656250
-INFO  org.apache.mxnet.Callback$Speedometer: Epoch[2] Batch [2]	Speed: 0.90 samples/sec	Train-accuracy=0.656250
-INFO  org.apache.mxnet.Callback$Speedometer: Epoch[2] Batch [3]	Speed: 0.91 samples/sec	Train-accuracy=0.687500
-INFO  org.apache.mxnet.Callback$Speedometer: Epoch[2] Batch [4]	Speed: 0.90 samples/sec	Train-accuracy=0.693750
-INFO  org.apache.mxnet.Callback$Speedometer: Epoch[2] Batch [5]	Speed: 0.91 samples/sec	Train-accuracy=0.703125
-INFO  org.apache.mxnet.Callback$Speedometer: Epoch[2] Batch [6]	Speed: 0.92 samples/sec	Train-accuracy=0.696429
-INFO  org.apache.mxnet.Callback$Speedometer: Epoch[2] Batch [7]	Speed: 0.91 samples/sec	Train-accuracy=0.699219
-INFO  org.apache.mxnet.Callback$Speedometer: Epoch[2] Batch [8]	Speed: 0.90 samples/sec	Train-accuracy=0.701389
-INFO  org.apache.mxnet.Callback$Speedometer: Epoch[2] Batch [9]	Speed: 0.90 samples/sec	Train-accuracy=0.690625
-INFO  org.apache.mxnet.Callback$Speedometer: Epoch[2] Batch [10]	Speed: 0.89 samples/sec	Train-accuracy=0.690341
-INFO  org.apache.mxnet.Callback$Speedometer: Epoch[2] Batch [11]	Speed: 0.90 samples/sec	Train-accuracy=0.695313
-INFO  org.apache.mxnet.Callback$Speedometer: Epoch[2] Batch [12]	Speed: 0.91 samples/sec	Train-accuracy=0.701923
-INFO  org.apache.mxnet.module.BaseModule: Epoch[2] Train-accuracy=0.7019231
-INFO  org.apache.mxnet.module.BaseModule: Epoch[2] Time cost=459809
-````
diff --git a/contrib/clojure-package/examples/bert/fine-tune-bert.ipynb b/contrib/clojure-package/examples/bert/fine-tune-bert.ipynb
deleted file mode 100644
index 5934477ea338..000000000000
--- a/contrib/clojure-package/examples/bert/fine-tune-bert.ipynb
+++ /dev/null
@@ -1,629 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Fine-tuning Sentence Pair Classification with BERT\n",
-    "\n",
-    "**This tutorial is based off of the Gluon NLP one here https://gluon-nlp.mxnet.io/examples/sentence_embedding/bert.html**\n",
-    "\n",
-    "Pre-trained language representations have been shown to improve many downstream NLP tasks such as question answering, and natural language inference. To apply pre-trained representations to these tasks, there are two strategies:\n",
-    "\n",
-    " - **feature-based approach**, which uses the pre-trained representations as additional features to the downstream task.\n",
-    " - **fine-tuning based approach**, which trains the downstream tasks by fine-tuning pre-trained parameters.\n",
-    " \n",
-    "While feature-based approaches such as ELMo [1] are effective in improving many downstream tasks, they require task-specific architectures. Devlin, Jacob, et al proposed BERT [2] (Bidirectional Encoder Representations from Transformers), which fine-tunes deep bidirectional representations on a wide range of tasks with minimal task-specific parameters, and obtained state- of-the-art results.\n",
-    "\n",
-    "In this tutorial, we will focus on fine-tuning with the pre-trained BERT model to classify semantically equivalent sentence pairs. Specifically, we will:\n",
-    "\n",
-    " 1. load the state-of-the-art pre-trained BERT model and attach an additional layer for classification\n",
-    " 2. process and transform sentence pair data for the task at hand, and \n",
-    " 3. fine-tune BERT model for sentence classification.\n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Preparation\n",
-    "\n",
-    "To run this tutorial locally, in the example directory:\n",
-    "\n",
-    "1. Get the model and supporting data by running `get_bert_data.sh`. \n",
-    "2. This Jupyter Notebook uses the lein-jupyter plugin to be able to execute Clojure code in project setting. The first time that you run it you will need to install the kernel with`lein jupyter install-kernel`. After that you can open the notebook in the project directory with `lein jupyter notebook`."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Load requirements\n",
-    "\n",
-    "We need to load up all the namespace requires"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 1,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [],
-   "source": [
-    "(ns bert.bert-sentence-classification\n",
-    "  (:require [bert.util :as bert-util]\n",
-    "            [clojure-csv.core :as csv]\n",
-    "            [clojure.java.shell :refer [sh]]\n",
-    "            [clojure.string :as string]\n",
-    "            [org.apache.clojure-mxnet.callback :as callback]\n",
-    "            [org.apache.clojure-mxnet.context :as context]\n",
-    "            [org.apache.clojure-mxnet.dtype :as dtype]\n",
-    "            [org.apache.clojure-mxnet.infer :as infer]\n",
-    "            [org.apache.clojure-mxnet.eval-metric :as eval-metric]\n",
-    "            [org.apache.clojure-mxnet.io :as mx-io]\n",
-    "            [org.apache.clojure-mxnet.layout :as layout]\n",
-    "            [org.apache.clojure-mxnet.module :as m]\n",
-    "            [org.apache.clojure-mxnet.ndarray :as ndarray]\n",
-    "            [org.apache.clojure-mxnet.optimizer :as optimizer]\n",
-    "            [org.apache.clojure-mxnet.symbol :as sym]))\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {
-    "collapsed": true
-   },
-   "source": [
-    "# Use the Pre-trained BERT Model\n",
-    "\n",
-    "In this tutorial we will use the pre-trained BERT model that was exported from GluonNLP via the `scripts/bert/staticbert/static_export_base.py`. For convenience, the model has been downloaded for you by running  the `get_bert_data.sh` file in the root directory of this example."
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Get BERT\n",
-    "\n",
-    "Let’s first take a look at the BERT model architecture for sentence pair classification below:\n",
-    "\n",
-    "![bert](https://gluon-nlp.mxnet.io/_images/bert-sentence-pair.png)\n",
-    "\n",
-    "where the model takes a pair of sequences and *pools* the representation of the first token in the sequence. Note that the original BERT model was trained for masked language model and next sentence prediction tasks, which includes layers for language model decoding and classification. These layers will not be used for fine-tuning sentence pair classification.\n",
-    "\n",
-    "Let's load the pre-trained BERT using the module API in MXNet."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "#'bert.bert-sentence-classification/bert-base"
-      ]
-     },
-     "execution_count": 2,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "(def model-path-prefix \"data/static_bert_base_net\")\n",
-    "\n",
-    ";; the vocabulary used in the model\n",
-    "(def vocab (bert-util/get-vocab))\n",
-    "\n",
-    ";; the maximum length of the sequence\n",
-    "(def seq-length 128)\n",
-    "\n",
-    "(def batch-size 32)\n",
-    "\n",
-    "(def bert-base (m/load-checkpoint {:prefix model-path-prefix :epoch 0}))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## Model Definition for Sentence Pair Classification\n",
-    "\n",
-    "Now that we have loaded the BERT model, we only need to attach an additional layer for classification. We can do this by defining a fine tune model from the symbol of the base BERT model."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "#'bert.bert-sentence-classification/model-sym"
-      ]
-     },
-     "execution_count": 3,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "(defn fine-tune-model\n",
-    "  \"msymbol: the pretrained network symbol\n",
-    "   num-classes: the number of classes for the fine-tune datasets\n",
-    "   dropout: the dropout rate\"\n",
-    "  [msymbol {:keys [num-classes dropout]}]\n",
-    "  (as-> msymbol data\n",
-    "    (sym/dropout {:data data :p dropout})\n",
-    "    (sym/fully-connected \"fc-finetune\" {:data data :num-hidden num-classes})\n",
-    "    (sym/softmax-output \"softmax\" {:data data})))\n",
-    "\n",
-    "(def model-sym (fine-tune-model (m/symbol bert-base) {:num-classes 2 :dropout 0.1}))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Data Preprocessing for BERT\n",
-    "\n",
-    "## Dataset\n",
-    "\n",
-    "For demonstration purpose, we use the dev set of the Microsoft Research Paraphrase Corpus dataset. The file is named ‘dev.tsv’ and was downloaded as part of the data script. Let’s take a look at the raw dataset."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 4,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "﻿Quality\t#1 ID\t#2 ID\t#1 String\t#2 String\n",
-      "1\t1355540\t1355592\tHe said the foodservice pie business doesn 't fit the company 's long-term growth strategy .\t\" The foodservice pie business does not fit our long-term growth strategy .\n",
-      "0\t2029631\t2029565\tMagnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .\tHis wife said he was \" 100 percent behind George Bush \" and looked forward to using his years of training in the war .\n",
-      "0\t487993\t487952\tThe dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .\tThe dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .\n",
-      "1\t1989515\t1989458\tThe AFL-CIO is waiting until October to decide if it will endorse a candidate .\tThe AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .\n",
-      "\n"
-     ]
-    }
-   ],
-   "source": [
-    "(-> (sh \"head\" \"-n\" \"5\" \"data/dev.tsv\") \n",
-    "    :out\n",
-    "    println)"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "The file contains 5 columns, separated by tabs (i.e. ‘\n",
-    "\n",
-    "\\t\n",
-    "‘). The first line of the file explains each of these columns: 0. the label indicating whether the two sentences are semantically equivalent 1. the id of the first sentence in this sample 2. the id of the second sentence in this sample 3. the content of the first sentence 4. the content of the second sentence\n",
-    "\n",
-    "For our task, we are interested in the 0th, 3rd and 4th columns. "
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .\n",
-      " The foodservice pie business does not fit our long-term growth strategy .\n",
-      "1\n"
-     ]
-    }
-   ],
-   "source": [
-    "(def raw-file \n",
-    "    (csv/parse-csv (string/replace (slurp \"data/dev.tsv\") \"\\\"\" \"\")\n",
-    "                   :delimiter \\tab\n",
-    "                   :strict true))\n",
-    "\n",
-    "(def data-train-raw (->> raw-file\n",
-    "                         (mapv #(vals (select-keys % [3 4 0])))\n",
-    "                         (rest) ; drop header\n",
-    "                         (into [])))\n",
-    "\n",
-    "(def sample (first data-train-raw))\n",
-    "(println (nth sample 0)) ;;;sentence a\n",
-    "(println (nth sample 1)) ;; sentence b\n",
-    "(println (nth sample 2)) ;; 1 means equivalent, 0 means not equivalent"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "To use the pre-trained BERT model, we need to preprocess the data in the same way it was trained. The following figure shows the input representation in BERT:\n",
-    "\n",
-    "![bert-input](https://gluon-nlp.mxnet.io/_images/bert-embed.png)\n",
-    "\n",
-    "We will do pre-processing on the inputs to get them in the right format and to perform the following transformations:\n",
-    "- tokenize the input sequences\n",
-    "- insert [CLS] at the beginning\n",
-    "- insert [SEP] between sentence one and sentence two, and at the end - generate segment ids to indicate whether a token belongs to the first sequence or the second sequence.\n",
-    "- generate valid length"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 6,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Train Count is =  408\n",
-      "[PAD] token id =  1\n",
-      "[CLS] token id =  2\n",
-      "[SEP] token id =  3\n",
-      "token ids = \n",
-      " [2 2002 2056 1996 0 11345 2449 2987 0 4906 1996 2194 0 0 3930 5656 0 1012 3 0 1996 0 11345 2449 2515 2025 4906 2256 0 3930 5656 0 1012 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]\n",
-      "segment ids = \n",
-      " [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]\n",
-      "valid length = \n",
-      " [31]\n",
-      "label = \n",
-      " [0]\n"
-     ]
-    }
-   ],
-   "source": [
-    "(defn pre-processing\n",
-    "  \"Preprocesses the sentences in the format that BERT is expecting\"\n",
-    "  [idx->token token->idx train-item]\n",
-    "    (let [[sentence-a sentence-b label] train-item\n",
-    "       ;;; pre-processing tokenize sentence\n",
-    "          token-1 (bert-util/tokenize (string/lower-case sentence-a))\n",
-    "          token-2 (bert-util/tokenize (string/lower-case sentence-b))\n",
-    "          valid-length (+ (count token-1) (count token-2))\n",
-    "        ;;; generate token types [0000...1111...0000]\n",
-    "          qa-embedded (into (bert-util/pad [] 0 (count token-1))\n",
-    "                            (bert-util/pad [] 1 (count token-2)))\n",
-    "          token-types (bert-util/pad qa-embedded 0 seq-length)\n",
-    "        ;;; make BERT pre-processing standard\n",
-    "          token-2 (conj token-2 \"[SEP]\")\n",
-    "          token-1 (into [] (concat [\"[CLS]\"] token-1 [\"[SEP]\"] token-2))\n",
-    "          tokens (bert-util/pad token-1 \"[PAD]\" seq-length)\n",
-    "        ;;; pre-processing - token to index translation\n",
-    "          indexes (bert-util/tokens->idxs token->idx tokens)]\n",
-    "    {:input-batch [indexes\n",
-    "                   token-types\n",
-    "                   [valid-length]]\n",
-    "     :label (if (= \"0\" label)\n",
-    "              [0]\n",
-    "              [1])\n",
-    "     :tokens tokens\n",
-    "     :train-item train-item}))\n",
-    "\n",
-    "(def idx->token (:idx->token vocab))\n",
-    "(def token->idx (:token->idx vocab))\n",
-    "(def dev (context/default-context))\n",
-    "(def processed-datas (mapv #(pre-processing idx->token token->idx %) data-train-raw))\n",
-    "(def train-count (count processed-datas))\n",
-    "(println \"Train Count is = \" train-count)\n",
-    "(println \"[PAD] token id = \" (get token->idx \"[PAD]\"))\n",
-    "(println \"[CLS] token id = \" (get token->idx \"[CLS]\"))\n",
-    "(println \"[SEP] token id = \" (get token->idx \"[SEP]\"))\n",
-    "(println \"token ids = \\n\"(-> (first processed-datas) :input-batch first)) \n",
-    "(println \"segment ids = \\n\"(-> (first processed-datas) :input-batch second)) \n",
-    "(println \"valid length = \\n\" (-> (first processed-datas) :input-batch last)) \n",
-    "(println \"label = \\n\" (-> (second processed-datas) :label)) \n",
-    "\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now that we have all the input-batches for each row, we are going to slice them up column-wise and create NDArray Iterators that we can use in training"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 7,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "#object[org.apache.mxnet.io.NDArrayIter 0x2583097d \"non-empty iterator\"]"
-      ]
-     },
-     "execution_count": 7,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "(defn slice-inputs-data\n",
-    "  \"Each sentence pair had to be processed as a row. This breaks all\n",
-    "  the rows up into a column for creating a NDArray\"\n",
-    "  [processed-datas n]\n",
-    "  (->> processed-datas\n",
-    "       (mapv #(nth (:input-batch %) n))\n",
-    "       (flatten)\n",
-    "       (into [])))\n",
-    "\n",
-    "(def prepared-data {:data0s (slice-inputs-data processed-datas 0)\n",
-    "                    :data1s (slice-inputs-data processed-datas 1)\n",
-    "                    :data2s (slice-inputs-data processed-datas 2)\n",
-    "                    :labels (->> (mapv :label processed-datas)\n",
-    "                                 (flatten)\n",
-    "                                 (into []))\n",
-    "                    :train-num (count processed-datas)})\n",
-    "\n",
-    "(def train-data\n",
-    "  (let [{:keys [data0s data1s data2s labels train-num]} prepared-data\n",
-    "        data-desc0 (mx-io/data-desc {:name \"data0\"\n",
-    "                                     :shape [train-num seq-length]\n",
-    "                                     :dtype dtype/FLOAT32\n",
-    "                                     :layout layout/NT})\n",
-    "        data-desc1 (mx-io/data-desc {:name \"data1\"\n",
-    "                                     :shape [train-num seq-length]\n",
-    "                                     :dtype dtype/FLOAT32\n",
-    "                                     :layout layout/NT})\n",
-    "        data-desc2 (mx-io/data-desc {:name \"data2\"\n",
-    "                                     :shape [train-num]\n",
-    "                                     :dtype dtype/FLOAT32\n",
-    "                                     :layout layout/N})\n",
-    "        label-desc (mx-io/data-desc {:name \"softmax_label\"\n",
-    "                                     :shape [train-num]\n",
-    "                                     :dtype dtype/FLOAT32\n",
-    "                                     :layout layout/N})]\n",
-    "    (mx-io/ndarray-iter {data-desc0 (ndarray/array data0s [train-num seq-length]\n",
-    "                                                   {:ctx dev})\n",
-    "                         data-desc1 (ndarray/array data1s [train-num seq-length]\n",
-    "                                                   {:ctx dev})\n",
-    "                         data-desc2 (ndarray/array data2s [train-num]\n",
-    "                                                   {:ctx dev})}\n",
-    "                        {:label {label-desc (ndarray/array labels [train-num]\n",
-    "                                                           {:ctx dev})}\n",
-    "                         :data-batch-size batch-size})))\n",
-    "train-data"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "# Fine-tune BERT Model\n",
-    "\n",
-    "Putting everything together, now we can fine-tune the model with a few epochs. For demonstration, we use a fixed learning rate and skip validation steps."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 8,
-   "metadata": {
-    "collapsed": false
-   },
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "Speedometer: epoch  0  count  1  metric  [accuracy 0.609375]\n",
-      "Speedometer: epoch  0  count  2  metric  [accuracy 0.6041667]\n",
-      "Speedometer: epoch  0  count  3  metric  [accuracy 0.5703125]\n",
-      "Speedometer: epoch  0  count  4  metric  [accuracy 0.55625]\n",
-      "Speedometer: epoch  0  count  5  metric  [accuracy 0.5625]\n",
-      "Speedometer: epoch  0  count  6  metric  [accuracy 0.55803573]\n",
-      "Speedometer: epoch  0  count  7  metric  [accuracy 0.5625]\n",
-      "Speedometer: epoch  0  count  8  metric  [accuracy 0.5798611]\n",
-      "Speedometer: epoch  0  count  9  metric  [accuracy 0.584375]\n",
-      "Speedometer: epoch  0  count  10  metric  [accuracy 0.57670456]\n",
-      "Speedometer: epoch  0  count  11  metric  [accuracy 0.5807292]\n",
-      "Speedometer: epoch  0  count  12  metric  [accuracy 0.5793269]\n",
-      "Speedometer: epoch  1  count  1  metric  [accuracy 0.5625]\n",
-      "Speedometer: epoch  1  count  2  metric  [accuracy 0.5520833]\n",
-      "Speedometer: epoch  1  count  3  metric  [accuracy 0.5859375]\n",
-      "Speedometer: epoch  1  count  4  metric  [accuracy 0.59375]\n",
-      "Speedometer: epoch  1  count  5  metric  [accuracy 0.6145833]\n",
-      "Speedometer: epoch  1  count  6  metric  [accuracy 0.625]\n",
-      "Speedometer: epoch  1  count  7  metric  [accuracy 0.640625]\n",
-      "Speedometer: epoch  1  count  8  metric  [accuracy 0.6527778]\n",
-      "Speedometer: epoch  1  count  9  metric  [accuracy 0.653125]\n",
-      "Speedometer: epoch  1  count  10  metric  [accuracy 0.6448864]\n",
-      "Speedometer: epoch  1  count  11  metric  [accuracy 0.640625]\n",
-      "Speedometer: epoch  1  count  12  metric  [accuracy 0.6418269]\n",
-      "Speedometer: epoch  2  count  1  metric  [accuracy 0.671875]\n",
-      "Speedometer: epoch  2  count  2  metric  [accuracy 0.7083333]\n",
-      "Speedometer: epoch  2  count  3  metric  [accuracy 0.7109375]\n",
-      "Speedometer: epoch  2  count  4  metric  [accuracy 0.725]\n",
-      "Speedometer: epoch  2  count  5  metric  [accuracy 0.7239583]\n",
-      "Speedometer: epoch  2  count  6  metric  [accuracy 0.71875]\n",
-      "Speedometer: epoch  2  count  7  metric  [accuracy 0.734375]\n",
-      "Speedometer: epoch  2  count  8  metric  [accuracy 0.7361111]\n",
-      "Speedometer: epoch  2  count  9  metric  [accuracy 0.721875]\n",
-      "Speedometer: epoch  2  count  10  metric  [accuracy 0.71022725]\n",
-      "Speedometer: epoch  2  count  11  metric  [accuracy 0.6979167]\n",
-      "Speedometer: epoch  2  count  12  metric  [accuracy 0.7019231]\n"
-     ]
-    },
-    {
-     "data": {
-      "text/plain": [
-       "#object[org.apache.mxnet.module.Module 0x73c42ae5 \"org.apache.mxnet.module.Module@73c42ae5\"]"
-      ]
-     },
-     "execution_count": 8,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "(def num-epoch 3)\n",
-    "\n",
-    "(def fine-tune-model (m/module model-sym {:contexts [dev]\n",
-    "                                          :data-names [\"data0\" \"data1\" \"data2\"]}))\n",
-    "\n",
-    "(m/fit fine-tune-model {:train-data train-data  :num-epoch num-epoch\n",
-    "                        :fit-params (m/fit-params {:allow-missing true\n",
-    "                                                   :arg-params (m/arg-params bert-base)\n",
-    "                                                   :aux-params (m/aux-params bert-base)\n",
-    "                                                   :optimizer (optimizer/adam {:learning-rate 5e-6 :episilon 1e-9})\n",
-    "                                                   :batch-end-callback (callback/speedometer batch-size 1)})})\n"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "### Explore results from the fine-tuned model\n",
-    "\n",
-    "Now that our model is fitted, we can use it to infer semantic equivalence of arbitrary sentence pairs. Note that for demonstration purpose we skipped the warmup learning rate schedule and validation on dev dataset used in the original implementation. This means that our model's performance will be significantly less than optimal. Please visit [here](https://gluon-nlp.mxnet.io/model_zoo/bert/index.html) for the complete fine-tuning scripts (using Python and GluonNLP).\n",
-    "\n",
-    "To do inference with our model we need a predictor. It must have a batch size of 1 so we can feed the model a single sentence pair."
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 14,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "#'bert.bert-sentence-classification/fine-tuned-predictor"
-      ]
-     },
-     "execution_count": 14,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "(def fine-tuned-prefix \"fine-tune-sentence-bert\")\n",
-    "\n",
-    "(m/save-checkpoint fine-tune-model {:prefix fine-tuned-prefix :epoch 3})\n",
-    "\n",
-    "(def fine-tuned-predictor\n",
-    "    (infer/create-predictor (infer/model-factory fine-tuned-prefix\n",
-    "                                                 [{:name \"data0\" :shape [1 seq-length] :dtype dtype/FLOAT32 :layout layout/NT}\n",
-    "                                                  {:name \"data1\" :shape [1 seq-length] :dtype dtype/FLOAT32 :layout layout/NT}\n",
-    "                                                  {:name \"data2\" :shape [1]            :dtype dtype/FLOAT32 :layout layout/N}])\n",
-    "                            {:epoch 3}))"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "Now we can write a function that feeds a sentence pair to the fine-tuned model:"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 15,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "#'bert.bert-sentence-classification/predict-equivalence"
-      ]
-     },
-     "execution_count": 15,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "(defn predict-equivalence\n",
-    "    [predictor sentence1 sentence2]\n",
-    "    (let [vocab (bert.util/get-vocab)\n",
-    "          processed-test-data (mapv #(pre-processing (:idx->token vocab)\n",
-    "                                                     (:token->idx vocab) %)\n",
-    "                                    [[sentence1 sentence2]])\n",
-    "          prediction (infer/predict-with-ndarray predictor\n",
-    "                                                 [(ndarray/array (slice-inputs-data processed-test-data 0) [1 seq-length])\n",
-    "                                                  (ndarray/array (slice-inputs-data processed-test-data 1) [1 seq-length])\n",
-    "                                                  (ndarray/array (slice-inputs-data processed-test-data 2) [1])])]\n",
-    "      (ndarray/->vec (first prediction))))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "[0.2633881 0.7366119]"
-      ]
-     },
-     "execution_count": 22,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    ";; Modify an existing sentence pair to test:\n",
-    ";; [\"1\"\n",
-    ";;  \"69773\"\n",
-    ";;  \"69792\"\n",
-    ";;  \"Cisco pared spending to compensate for sluggish sales .\"\n",
-    ";;  \"In response to sluggish sales , Cisco pared spending .\"]\n",
-    "(predict-equivalence fine-tuned-predictor\n",
-    "                     \"The company cut spending to compensate for weak sales .\"\n",
-    "                     \"In response to poor sales results, the company cut spending .\")"
-   ]
-  },
-  {
-   "cell_type": "markdown",
-   "metadata": {},
-   "source": [
-    "## References\n",
-    "\n",
-    "[1] Peters, Matthew E., et al. “Deep contextualized word representations.” arXiv preprint arXiv:1802.05365 (2018).\n",
-    "\n",
-    "[2] Devlin, Jacob, et al. “Bert: Pre-training of deep bidirectional transformers for language understanding.” arXiv preprint arXiv:1810.04805 (2018)."
-   ]
-  }
- ],
- "metadata": {
-  "anaconda-cloud": {},
-  "kernelspec": {
-   "display_name": "Lein-Clojure",
-   "language": "clojure",
-   "name": "lein-clojure"
-  },
-  "language_info": {
-   "file_extension": ".clj",
-   "mimetype": "text/x-clojure",
-   "name": "clojure",
-   "version": "1.9.0"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 1
-}
diff --git a/contrib/clojure-package/examples/bert/fine-tune-bert.md b/contrib/clojure-package/examples/bert/fine-tune-bert.md
deleted file mode 100644
index 4e6681e7aade..000000000000
--- a/contrib/clojure-package/examples/bert/fine-tune-bert.md
+++ /dev/null
@@ -1,371 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-
-# Fine-tuning Sentence Pair Classification with BERT
-
-**This tutorial is based on the GluonNLP tutorial at https://gluon-nlp.mxnet.io/examples/sentence_embedding/bert.html**
-
-Pre-trained language representations have been shown to improve many downstream NLP tasks such as question answering and natural language inference. To apply pre-trained representations to these tasks, there are two strategies:
-
-- the feature-based approach, which uses the pre-trained representations as additional features to the downstream task, and
-- the fine-tuning based approach, which trains the downstream tasks by fine-tuning pre-trained parameters.
-
-While feature-based approaches such as ELMo [3] (introduced in the previous tutorial) are effective in improving many downstream tasks, they require task-specific architectures. Devlin, Jacob, et al. proposed BERT [1] (Bidirectional Encoder Representations from Transformers), which fine-tunes deep bidirectional representations on a wide range of tasks with minimal task-specific parameters, and obtained state-of-the-art results.
-
-In this tutorial, we will focus on fine-tuning with the pre-trained BERT model to classify semantically equivalent sentence pairs. Specifically, we will:
-
-- load the state-of-the-art pre-trained BERT model and attach an additional layer for classification,
-- process and transform sentence pair data for the task at hand, and
-- fine-tune the BERT model for sentence classification.
-
-
-
-## Preparation
-
-To run this tutorial locally, in the example directory:
-
-1. Get the model and supporting data by running `get_bert_data.sh`. 
-2. This Jupyter Notebook uses the lein-jupyter plugin to execute Clojure code in a project setting. The first time that you run it you will need to install the kernel with `lein jupyter install-kernel`. After that you can open the notebook in the project directory with `lein jupyter notebook`.
-
-## Load requirements
-
-We need to load up all the namespace requires
-
-
-```clojure
-(ns bert.bert-sentence-classification
-  (:require [bert.util :as bert-util]
-            [clojure-csv.core :as csv]
-            [clojure.java.shell :refer [sh]]
-            [clojure.string :as string]
-            [org.apache.clojure-mxnet.callback :as callback]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.dtype :as dtype]
-            [org.apache.clojure-mxnet.eval-metric :as eval-metric]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.layout :as layout]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.optimizer :as optimizer]
-            [org.apache.clojure-mxnet.symbol :as sym]))
-
-```
-
-# Use the Pre-trained BERT Model
-
-In this tutorial we will use the pre-trained BERT model that was exported from GluonNLP via the `scripts/bert/staticbert/static_export_base.py` script. For convenience, the model is downloaded for you when you run the `get_bert_data.sh` file in the root directory of this example.
-
-## Get BERT
-
-Let’s first take a look at the BERT model architecture for sentence pair classification below:
-
-![bert](https://gluon-nlp.mxnet.io/_images/bert-sentence-pair.png)
-
-where the model takes a pair of sequences and pools the representation of the first token in the sequence. Note that the original BERT model was trained for masked language model and next sentence prediction tasks, which includes layers for language model decoding and classification. These layers will not be used for fine-tuning sentence pair classification.
-
-Let's load the pre-trained BERT using the module API in MXNet.
-
-
-```clojure
-(def model-path-prefix "data/static_bert_base_net")
-;; the vocabulary used in the model
-(def vocab (bert-util/get-vocab))
-;; the input question
-;; the maximum length of the sequence
-(def seq-length 128)
-
-(def bert-base (m/load-checkpoint {:prefix model-path-prefix :epoch 0}))
-```
-
-
-
-
-    #'bert.bert-sentence-classification/bert-base
-
-
-
-## Model Definition for Sentence Pair Classification
-
-Now that we have loaded the BERT model, we only need to attach an additional layer for classification. We can do this by defining a fine tune model from the symbol of the base BERT model.
-
-
-```clojure
-(defn fine-tune-model
-  "Appends a classification head to a pre-trained network symbol.
-   msymbol: the pre-trained network symbol
-   options map:
-     :num-classes - the number of classes in the fine-tuning dataset
-     :dropout     - the dropout rate applied in front of the new layer"
-  [msymbol {:keys [num-classes dropout]}]
-  (let [dropped (sym/dropout {:data msymbol :p dropout})
-        logits  (sym/fully-connected "fc-finetune"
-                                     {:data dropped :num-hidden num-classes})]
-    (sym/softmax-output "softmax" {:data logits})))
-
-(def model-sym (fine-tune-model (m/symbol bert-base) {:num-classes 2 :dropout 0.1}))
-```
-
-
-
-
-    #'bert.bert-sentence-classification/model-sym
-
-
-
-# Data Preprocessing for BERT
-
-## Dataset
-
-For demonstration purpose, we use the dev set of the Microsoft Research Paraphrase Corpus dataset. The file is named ‘dev.tsv’ and was downloaded as part of the data script. Let’s take a look at the raw dataset.
-
-
-```clojure
-(-> (sh "head" "-n" "5" "data/dev.tsv") 
-    :out
-    println)
-```
-
-    ﻿Quality	#1 ID	#2 ID	#1 String	#2 String
-    1	1355540	1355592	He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .	" The foodservice pie business does not fit our long-term growth strategy .
-    0	2029631	2029565	Magnarelli said Racicot hated the Iraqi regime and looked forward to using his long years of training in the war .	His wife said he was " 100 percent behind George Bush " and looked forward to using his years of training in the war .
-    0	487993	487952	The dollar was at 116.92 yen against the yen , flat on the session , and at 1.2891 against the Swiss franc , also flat .	The dollar was at 116.78 yen JPY = , virtually flat on the session , and at 1.2871 against the Swiss franc CHF = , down 0.1 percent .
-    1	1989515	1989458	The AFL-CIO is waiting until October to decide if it will endorse a candidate .	The AFL-CIO announced Wednesday that it will decide in October whether to endorse a candidate before the primaries .
-    
-
-
-The file contains 5 columns, separated by tabs (i.e. `\t`). The first line of the file explains each of these columns:
-
-0. the label indicating whether the two sentences are semantically equivalent
-1. the id of the first sentence in this sample
-2. the id of the second sentence in this sample
-3. the content of the first sentence
-4. the content of the second sentence
-
-For our task, we are interested in the 0th, 3rd and 4th columns. 
-
-
-```clojure
-(def raw-file 
-    (csv/parse-csv (string/replace (slurp "data/dev.tsv") "\"" "")
-                   :delimiter \tab
-                   :strict true))
-
-(def data-train-raw (->> raw-file
-                         (mapv #(vals (select-keys % [3 4 0])))
-                         (rest) ; drop header
-                         (into [])))
-
-(def sample (first data-train-raw))
-(println (nth sample 0)) ;;;sentence a
-(println (nth sample 1)) ;; sentence b
-(println (nth sample 2)) ;; 1 means equivalent, 0 means not equivalent
-```
-
-    He said the foodservice pie business doesn 't fit the company 's long-term growth strategy .
-     The foodservice pie business does not fit our long-term growth strategy .
-    1
-
-
-To use the pre-trained BERT model, we need to preprocess the data in the same way it was trained. The following figure shows the input representation in BERT:
-
-![bert-input](https://gluon-nlp.mxnet.io/_images/bert-embed.png)
-
-We will do pre-processing on the inputs to get them in the right format and to perform the following transformations:
-- tokenize the input sequences
-- insert [CLS] at the beginning
-- insert [SEP] between sentence one and sentence two, and at the end
-- generate segment ids to indicate whether a token belongs to the first sequence or the second sequence
-- generate valid length
-
-
-```clojure
-(defn pre-processing
-  "Preprocesses the sentences in the format that BERT is expecting"
-  [ctx idx->token token->idx train-item] ; NOTE(review): ctx and idx->token are never read in this body
-    (let [[sentence-a sentence-b label] train-item
-       ;;; lower-case both sentences, then tokenize them
-          token-1 (bert-util/tokenize (string/lower-case sentence-a))
-          token-2 (bert-util/tokenize (string/lower-case sentence-b))
-          valid-length (+ (count token-1) (count token-2)) ; taken before [CLS]/[SEP] are added below, so they are not counted
-        ;;; token types [0000...1111...0000]; presumably bert-util/pad fills with the given value up to the given length -- TODO confirm
-          qa-embedded (into (bert-util/pad [] 0 (count token-1))
-                            (bert-util/pad [] 1 (count token-2)))
-          token-types (bert-util/pad qa-embedded 0 seq-length) ; NOTE(review): built from raw token counts, so not aligned with the [CLS]/[SEP] positions in tokens -- confirm intended
-        ;;; assemble [CLS] sentence-a [SEP] sentence-b [SEP], then pad with [PAD] (seq-length is a var defined outside this function)
-          token-2 (conj token-2 "[SEP]")
-          token-1 (into [] (concat ["[CLS]"] token-1 ["[SEP]"] token-2))
-          tokens (bert-util/pad token-1 "[PAD]" seq-length)
-        ;;; translate the padded tokens to vocabulary indexes
-          indexes (bert-util/tokens->idxs token->idx tokens)]
-    {:input-batch [indexes
-                   token-types
-                   [valid-length]]
-     :label (if (= "0" label) ; only the string "0" maps to [0]; every other label value maps to [1]
-              [0]
-              [1])
-     :tokens tokens
-     :train-item train-item}))
-
-(def idx->token (:idx->token vocab))
-(def token->idx (:token->idx vocab))
-(def dev (context/default-context))
-(def processed-datas (mapv #(pre-processing dev idx->token token->idx %) data-train-raw))
-(def train-count (count processed-datas))
-(println "Train Count is = " train-count)
-(println "[PAD] token id = " (get token->idx "[PAD]"))
-(println "[CLS] token id = " (get token->idx "[CLS]"))
-(println "[SEP] token id = " (get token->idx "[SEP]"))
-(println "token ids = \n"(-> (first processed-datas) :input-batch first)) 
-(println "segment ids = \n"(-> (first processed-datas) :input-batch second)) 
-(println "valid length = \n" (-> (first processed-datas) :input-batch last)) 
-(println "label = \n" (-> (second processed-datas) :label)) 
-
-
-```
-
-    Train Count is =  408
-    [PAD] token id =  1
-    [CLS] token id =  2
-    [SEP] token id =  3
-    token ids = 
-     [2 2002 2056 1996 0 11345 2449 2987 0 4906 1996 2194 0 0 3930 5656 0 1012 3 0 1996 0 11345 2449 2515 2025 4906 2256 0 3930 5656 0 1012 3 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
-    segment ids = 
-     [0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
-    valid length = 
-     [31]
-    label = 
-     [0]
-
-
-Now that we have all the input-batches for each row, we are going to slice them up column-wise and create NDArray Iterators that we can use in training
-
-
-```clojure
-(defn slice-inputs-data
-  "Collects the n-th entry of every row's :input-batch and flattens the
-  result into one vector, i.e. turns the row-wise samples into a single
-  column that can be used to create an NDArray"
-  [processed-datas n]
-  (let [nth-column (fn [row] (nth (:input-batch row) n))]
-    (vec (flatten (map nth-column processed-datas)))))
-
-(def prepared-data {:data0s (slice-inputs-data processed-datas 0)
-                    :data1s (slice-inputs-data processed-datas 1)
-                    :data2s (slice-inputs-data processed-datas 2)
-                    :labels (->> (mapv :label processed-datas)
-                                 (flatten)
-                                 (into []))
-                    :train-num (count processed-datas)})
-
-(def batch-size 32)
-
-(def train-data
-  (let [{:keys [data0s data1s data2s labels train-num]} prepared-data
-        data-desc0 (mx-io/data-desc {:name "data0"
-                                     :shape [train-num seq-length]
-                                     :dtype dtype/FLOAT32
-                                     :layout layout/NT})
-        data-desc1 (mx-io/data-desc {:name "data1"
-                                     :shape [train-num seq-length]
-                                     :dtype dtype/FLOAT32
-                                     :layout layout/NT})
-        data-desc2 (mx-io/data-desc {:name "data2"
-                                     :shape [train-num]
-                                     :dtype dtype/FLOAT32
-                                     :layout layout/N})
-        label-desc (mx-io/data-desc {:name "softmax_label"
-                                     :shape [train-num]
-                                     :dtype dtype/FLOAT32
-                                     :layout layout/N})]
-    (mx-io/ndarray-iter {data-desc0 (ndarray/array data0s [train-num seq-length]
-                                                   {:ctx dev})
-                         data-desc1 (ndarray/array data1s [train-num seq-length]
-                                                   {:ctx dev})
-                         data-desc2 (ndarray/array data2s [train-num]
-                                                   {:ctx dev})}
-                        {:label {label-desc (ndarray/array labels [train-num]
-                                                           {:ctx dev})}
-                         :data-batch-size batch-size})))
-train-data
-```
-
-
-
-
-    #object[org.apache.mxnet.io.NDArrayIter 0x2583097d "non-empty iterator"]
-
-
-
-# Fine-tune BERT Model
-
-Putting everything together, now we can fine-tune the model with a few epochs. For demonstration, we use a fixed learning rate and skip validation steps.
-
-
-```clojure
-;; Number of passes over the training data.
-(def num-epoch 3)
-
-;; Module built from the classification symbol; it is fed through the three
-;; inputs named "data0", "data1" and "data2".
-;; NOTE: this def rebinds the `fine-tune-model` var that held the symbol-building
-;; function defined earlier, so that function is no longer reachable by name.
-(def fine-tune-model (m/module model-sym {:contexts [dev]
-                                         :data-names ["data0" "data1" "data2"]}))
-
-;; Training starts from the arg/aux params of the loaded `bert-base` module.
-;; :allow-missing is presumably required because the new "fc-finetune" layer has
-;; no entry in those pre-trained params -- confirm against the Module docs.
-;; Adam's numerical-stability term is passed under the key :epsilon.
-;; The speedometer callback logs the running metric after every batch.
-(m/fit fine-tune-model {:train-data train-data  :num-epoch num-epoch
-                        :fit-params (m/fit-params {:allow-missing true
-                                                   :arg-params (m/arg-params bert-base)
-                                                   :aux-params (m/aux-params bert-base)
-                                                   :optimizer (optimizer/adam {:learning-rate 5e-6 :epsilon 1e-9})
-                                                   :batch-end-callback (callback/speedometer batch-size 1)})})
-
-```
-
-    Speedometer: epoch  0  count  1  metric  [accuracy 0.609375]
-    Speedometer: epoch  0  count  2  metric  [accuracy 0.6041667]
-    Speedometer: epoch  0  count  3  metric  [accuracy 0.5703125]
-    Speedometer: epoch  0  count  4  metric  [accuracy 0.55625]
-    Speedometer: epoch  0  count  5  metric  [accuracy 0.5625]
-    Speedometer: epoch  0  count  6  metric  [accuracy 0.55803573]
-    Speedometer: epoch  0  count  7  metric  [accuracy 0.5625]
-    Speedometer: epoch  0  count  8  metric  [accuracy 0.5798611]
-    Speedometer: epoch  0  count  9  metric  [accuracy 0.584375]
-    Speedometer: epoch  0  count  10  metric  [accuracy 0.57670456]
-    Speedometer: epoch  0  count  11  metric  [accuracy 0.5807292]
-    Speedometer: epoch  0  count  12  metric  [accuracy 0.5793269]
-    Speedometer: epoch  1  count  1  metric  [accuracy 0.5625]
-    Speedometer: epoch  1  count  2  metric  [accuracy 0.5520833]
-    Speedometer: epoch  1  count  3  metric  [accuracy 0.5859375]
-    Speedometer: epoch  1  count  4  metric  [accuracy 0.59375]
-    Speedometer: epoch  1  count  5  metric  [accuracy 0.6145833]
-    Speedometer: epoch  1  count  6  metric  [accuracy 0.625]
-    Speedometer: epoch  1  count  7  metric  [accuracy 0.640625]
-    Speedometer: epoch  1  count  8  metric  [accuracy 0.6527778]
-    Speedometer: epoch  1  count  9  metric  [accuracy 0.653125]
-    Speedometer: epoch  1  count  10  metric  [accuracy 0.6448864]
-    Speedometer: epoch  1  count  11  metric  [accuracy 0.640625]
-    Speedometer: epoch  1  count  12  metric  [accuracy 0.6418269]
-    Speedometer: epoch  2  count  1  metric  [accuracy 0.671875]
-    Speedometer: epoch  2  count  2  metric  [accuracy 0.7083333]
-    Speedometer: epoch  2  count  3  metric  [accuracy 0.7109375]
-    Speedometer: epoch  2  count  4  metric  [accuracy 0.725]
-    Speedometer: epoch  2  count  5  metric  [accuracy 0.7239583]
-    Speedometer: epoch  2  count  6  metric  [accuracy 0.71875]
-    Speedometer: epoch  2  count  7  metric  [accuracy 0.734375]
-    Speedometer: epoch  2  count  8  metric  [accuracy 0.7361111]
-    Speedometer: epoch  2  count  9  metric  [accuracy 0.721875]
-    Speedometer: epoch  2  count  10  metric  [accuracy 0.71022725]
-    Speedometer: epoch  2  count  11  metric  [accuracy 0.6979167]
-    Speedometer: epoch  2  count  12  metric  [accuracy 0.7019231]
-
-
-
-
-
-    #object[org.apache.mxnet.module.Module 0x73c42ae5 "org.apache.mxnet.module.Module@73c42ae5"]
-
-
diff --git a/contrib/clojure-package/examples/bert/get_bert_data.sh b/contrib/clojure-package/examples/bert/get_bert_data.sh
deleted file mode 100755
index 10ed8e9a1f8e..000000000000
--- a/contrib/clojure-package/examples/bert/get_bert_data.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-
-data_path=data
-s3_base=https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/BertQA
-
-# Download URL $1 to $data_path/$2 unless that file is already present.
-# The transfer goes to a temporary name first, so an interrupted or failed
-# download is never mistaken for a complete file on the next run.
-fetch() {
-  local url=$1
-  local dest="$data_path/$2"
-  if [ ! -f "$dest" ]; then
-    # -f: fail on HTTP errors instead of saving the error page as the file
-    # -L: follow redirects
-    curl -fL "$url" -o "$dest.part"
-    mv "$dest.part" "$dest"
-  fi
-}
-
-mkdir -p "$data_path"
-fetch "$s3_base/vocab.json" vocab.json
-fetch "$s3_base/static_bert_qa-0002.params" static_bert_qa-0002.params
-fetch "$s3_base/static_bert_qa-symbol.json" static_bert_qa-symbol.json
-fetch "$s3_base/static_bert_base_net-symbol.json" static_bert_base_net-symbol.json
-fetch "$s3_base/static_bert_base_net-0000.params" static_bert_base_net-0000.params
-fetch https://raw.githubusercontent.com/dmlc/gluon-nlp/master/docs/examples/sentence_embedding/dev.tsv dev.tsv
diff --git a/contrib/clojure-package/examples/bert/project.clj b/contrib/clojure-package/examples/bert/project.clj
deleted file mode 100644
index b53e0875951f..000000000000
--- a/contrib/clojure-package/examples/bert/project.clj
+++ /dev/null
@@ -1,32 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-
-(defproject bert "0.1.0-SNAPSHOT"
-  :description "BERT Examples"
-  :plugins [[lein-cljfmt "0.5.7"]
-            ;;; lein-jupyter seems to have some dependency incompatibilities with cider,
-            ;;; so if you run into trouble please delete the `lein-jupyter` plugin
-            [lein-jupyter "0.1.16" :exclusions [org.clojure/tools.nrepl org.clojure/clojure org.codehaus.plexus/plexus-utils org.clojure/tools.reader]]]
-  :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "2.0.0-SNAPSHOT"]
-                 [cheshire "5.8.1"]
-                 [clojure-csv/clojure-csv "2.0.1"]]
-  :pedantic? :skip
-  :java-source-paths ["src/java"]
-  :main bert.infer
-  :repl-options {:init-ns bert.infer})
diff --git a/contrib/clojure-package/examples/bert/squad-samples.edn b/contrib/clojure-package/examples/bert/squad-samples.edn
deleted file mode 100644
index e99a181f7d17..000000000000
--- a/contrib/clojure-package/examples/bert/squad-samples.edn
+++ /dev/null
@@ -1,39 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-
-[{:input-answer "Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm."
-  :input-question "By what main attribute are computational problems classified utilizing computational complexity theory?"
-  :ground-truth-answers ["Computational complexity theory"
-                         "Computational  complexity theory"
-                         "complexity theory"]}
- {:input-answer "Steam engines are external combustion engines, where the working fluid is separate from the combustion products. Non-combustion heat sources such as solar power, nuclear power or geothermal energy may be used. The ideal thermodynamic cycle used to analyze this process is called the Rankine cycle. In the cycle, water is heated and transforms into steam within a boiler operating at a high pressure. When expanded through pistons or turbines, mechanical work is done. The reduced-pressure steam is then condensed and pumped back into the boiler."
-  :input-question "Along with geothermal and nuclear, what is a notable non-combustion heat source?"
-  :ground-truth-answers ["solar"
-                         "solar power"
-                         "solar power, nuclear power or geothermal energysolar"]}
- {:input-answer "In the 1960s, a series of discoveries, the most important of which was seafloor spreading, showed that the Earth's lithosphere, which includes the crust and rigid uppermost portion of the upper mantle, is separated into a number of tectonic plates that move across the plastically deforming, solid, upper mantle, which is called the asthenosphere. There is an intimate coupling between the movement of the plates on the surface and the convection of the mantle: oceanic plate motions and mantle convection currents always move in the same direction, because the oceanic lithosphere is the rigid upper thermal boundary layer of the convecting mantle. This coupling between rigid plates moving on the surface of the Earth and the convecting mantle is called plate tectonics."
-  :input-question "What was the most important discovery that led to the understanding that Earth's lithosphere is separated into tectonic plates?"
-  :ground-truth-answers ["seafloor spreading"]}
- ;;; totally made up
- {:input-answer "Susan had a cat named Sammy when she lived in the green house."
-  :input-question "What was Susan's cat named?"
-  :ground-truth-answers ["Sammy" "sammy"]}
- ;;; more or less from wikipedia on clojure
- {:input-answer "Rich Hickey is the creator of the Clojure language. Before Clojure, he developed dotLisp, a similar project based on the .NET platform, and three earlier attempts to provide interoperability between Lisp and Java: a Java foreign language interface for Common Lisp, A Foreign Object Interface for Lisp, and a Lisp-friendly interface to Java Servlets."
-  :input-question "Who created Clojure?"
-  :ground-truth-answers ["rich" "hickey"]}]
diff --git a/contrib/clojure-package/examples/bert/src/bert/bert_sentence_classification.clj b/contrib/clojure-package/examples/bert/src/bert/bert_sentence_classification.clj
deleted file mode 100644
index 6ec4d586ad17..000000000000
--- a/contrib/clojure-package/examples/bert/src/bert/bert_sentence_classification.clj
+++ /dev/null
@@ -1,225 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns bert.bert-sentence-classification
-  "Fine-tuning Sentence Pair Classification with BERT
-  This tutorial focuses on fine-tuning with the pre-trained BERT model to classify semantically equivalent sentence pairs.
-
-  Specifically, we will:
-    1. load the state-of-the-art pre-trained BERT model
-    2. attach an additional layer for classification
-    3. process and transform sentence pair data for the task at hand
-    4. fine-tune BERT model for sentence classification"
-  (:require [bert.util :as bert-util]
-            [clojure-csv.core :as csv]
-            [clojure.string :as string]
-            [org.apache.clojure-mxnet.callback :as callback]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.dtype :as dtype]
-            [org.apache.clojure-mxnet.infer :as infer]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.layout :as layout]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.optimizer :as optimizer]
-            [org.apache.clojure-mxnet.symbol :as sym]))
-
-;; Pre-trained language representations have been shown to improve
-;; many downstream NLP tasks such as question answering, and natural
-;; language inference. To apply pre-trained representations to these
-;; tasks, there are two strategies:
-
-;;  *  feature-based approach, which uses the pre-trained representations as additional features to the downstream task.
-;;  * fine-tuning based approach, which trains the downstream tasks by fine-tuning pre-trained parameters.
-
-;; While feature-based approaches such as ELMo are effective in
-;; improving many downstream tasks, they require task-specific
-;; architectures. Devlin, Jacob, et al proposed BERT (Bidirectional
-;; Encoder Representations from Transformers), which fine-tunes deep
-;; bidirectional representations on a wide range of tasks with minimal
-;; task-specific parameters, and obtained state-of-the-art results.
-
-(def model-path-prefix "data/static_bert_base_net")
-
-(def fine-tuned-prefix "fine-tune-sentence-bert")
-
-;; the maximum length of the sequence
-(def seq-length 128)
-
-(defn pre-processing
-  "Preprocesses the sentences in the format that BERT is expecting"
-  [idx->token token->idx train-item]
-  (let [[sentence-a sentence-b label] train-item
-        ;; pre-processing tokenize sentence
-        token-1 (bert-util/tokenize (string/lower-case sentence-a))
-        token-2 (bert-util/tokenize (string/lower-case sentence-b))
-        valid-length (+ (count token-1) (count token-2))
-        ;; generate token types [0000...1111...0000]
-        qa-embedded (into (bert-util/pad [] 0 (count token-1))
-                          (bert-util/pad [] 1 (count token-2)))
-        token-types (bert-util/pad qa-embedded 0 seq-length)
-        ;; make BERT pre-processing standard
-        token-2 (conj token-2 "[SEP]")
-        token-1 (into [] (concat ["[CLS]"] token-1 ["[SEP]"] token-2))
-        tokens (bert-util/pad token-1 "[PAD]" seq-length)
-        ;; pre-processing - token to index translation
-        indexes (bert-util/tokens->idxs token->idx tokens)]
-    {:input-batch [indexes
-                   token-types
-                   [valid-length]]
-     :label (if (= "0" label)
-              [0]
-              [1])
-     :tokens tokens
-     :train-item train-item}))
-
-(defn fine-tune-model
-  "msymbol: the pretrained network symbol
-   num-classes: the number of classes for the fine-tune datasets
-   dropout: The dropout rate amount"
-  [msymbol {:keys [num-classes dropout]}]
-  (as-> msymbol data
-    (sym/dropout {:data data :p dropout})
-    (sym/fully-connected "fc-finetune" {:data data :num-hidden num-classes})
-    (sym/softmax-output "softmax" {:data data})))
-
-(defn slice-inputs-data
-  "Each sentence pair had to be processed as a row. This breaks all
-  the rows up into a column for creating a NDArray"
-  [processed-datas n]
-  (->> processed-datas
-       (mapv #(nth (:input-batch %) n))
-       (flatten)
-       (into [])))
-
-(defn get-raw-data []
-  (csv/parse-csv (string/replace (slurp "data/dev.tsv") "\"" "")
-                 :delimiter \tab
-                 :strict true))
-
-(defn prepare-data
-  "This prepares the sentence pairs into NDArrays for use in NDArrayIterator"
-  [raw-data]
-  (let [vocab (bert-util/get-vocab)
-        idx->token (:idx->token vocab)
-        token->idx (:token->idx vocab)
-        data-train-raw (->> raw-data
-                            (mapv #(vals (select-keys % [3 4 0])))
-                            (rest) ; drop header
-                            (into []))
-        processed-datas (mapv #(pre-processing idx->token token->idx %) data-train-raw)]
-    {:data0s (slice-inputs-data processed-datas 0)
-     :data1s (slice-inputs-data processed-datas 1)
-     :data2s (slice-inputs-data processed-datas 2)
-     :labels (->> (mapv :label processed-datas)
-                  (flatten)
-                  (into []))
-     :train-num (count processed-datas)}))
-
-(defn train
-  "Trains (fine tunes) the sentence pairs for a classification task on the BERT Base model"
-  [dev num-epoch]
-  (let [bert-base (m/load-checkpoint {:prefix model-path-prefix :epoch 0})
-        model-sym (fine-tune-model (m/symbol bert-base) {:num-classes 2 :dropout 0.1})
-        {:keys [data0s data1s data2s labels train-num]} (prepare-data (get-raw-data))
-        batch-size 32
-        data-desc0 (mx-io/data-desc {:name "data0"
-                                     :shape [train-num seq-length]
-                                     :dtype dtype/FLOAT32
-                                     :layout layout/NT})
-        data-desc1 (mx-io/data-desc {:name "data1"
-                                     :shape [train-num seq-length]
-                                     :dtype dtype/FLOAT32
-                                     :layout layout/NT})
-        data-desc2 (mx-io/data-desc {:name "data2"
-                                     :shape [train-num]
-                                     :dtype dtype/FLOAT32
-                                     :layout layout/N})
-        label-desc (mx-io/data-desc {:name "softmax_label"
-                                     :shape [train-num]
-                                     :dtype dtype/FLOAT32
-                                     :layout layout/N})
-        train-data  (mx-io/ndarray-iter {data-desc0 (ndarray/array data0s [train-num seq-length]
-                                                                   {:ctx dev})
-                                         data-desc1 (ndarray/array data1s [train-num seq-length]
-                                                                   {:ctx dev})
-                                         data-desc2 (ndarray/array data2s [train-num]
-                                                                   {:ctx dev})}
-                                        {:label {label-desc (ndarray/array labels [train-num]
-                                                                           {:ctx dev})}
-                                         :data-batch-size batch-size})
-        fitted-model (m/fit (m/module model-sym {:contexts [dev]
-                                                 :data-names ["data0" "data1" "data2"]})
-                            {:train-data train-data  :num-epoch num-epoch
-                             :fit-params (m/fit-params {:allow-missing true
-                                                        :arg-params (m/arg-params bert-base)
-                                                        :aux-params (m/aux-params bert-base)
-                                                        :optimizer (optimizer/adam {:learning-rate 5e-6 :epsilon 1e-9})
-                                                        :batch-end-callback (callback/speedometer batch-size 1)})})]
-    (m/save-checkpoint fitted-model {:prefix fine-tuned-prefix :epoch num-epoch})
-    fitted-model))
-
-(defn -main [& args]
-  (let [[dev-arg num-epoch-arg] args
-        dev (if (= dev-arg ":gpu") (context/gpu) (context/cpu))
-        num-epoch (if num-epoch-arg (Integer/parseInt num-epoch-arg) 3)]
-    (println "Running example with " dev " and " num-epoch " epochs ")
-    (train dev num-epoch)))
-
-;; For evaluating the model
-(defn predict-equivalence
-  "Get the fine-tuned model's opinion on whether two sentences are equivalent:"
-  [predictor sentence1 sentence2]
-  (let [vocab (bert.util/get-vocab)
-        processed-test-data (mapv #(pre-processing (:idx->token vocab)
-                                                   (:token->idx vocab) %)
-                                  [[sentence1 sentence2]])
-        prediction (infer/predict-with-ndarray predictor
-                                               [(ndarray/array (slice-inputs-data processed-test-data 0) [1 seq-length])
-                                                (ndarray/array (slice-inputs-data processed-test-data 1) [1 seq-length])
-                                                (ndarray/array (slice-inputs-data processed-test-data 2) [1])])]
-    (ndarray/->vec (first prediction))))
-
-(comment
-
-  (train (context/cpu 0) 3)
-
-  (m/save-checkpoint model {:prefix fine-tuned-prefix :epoch 3})
-
-  
-  ;;;; Explore results from the fine-tuned model
-
-  ;; We need a predictor with a batch size of 1, so we can feed the
-  ;; model a single sentence pair.
-  (def fine-tuned-predictor
-    (infer/create-predictor (infer/model-factory fine-tuned-prefix
-                                                 [{:name "data0" :shape [1 seq-length] :dtype dtype/FLOAT32 :layout layout/NT}
-                                                  {:name "data1" :shape [1 seq-length] :dtype dtype/FLOAT32 :layout layout/NT}
-                                                  {:name "data2" :shape [1]            :dtype dtype/FLOAT32 :layout layout/N}])
-                            {:epoch 3}))
-
-  ;; Modify an existing sentence pair to test:
-  ;; ["1"
-  ;;  "69773"
-  ;;  "69792"
-  ;;  "Cisco pared spending to compensate for sluggish sales ."
-  ;;  "In response to sluggish sales , Cisco pared spending ."]
-  (predict-equivalence fine-tuned-predictor
-                       "The company cut spending to compensate for weak sales ."
-                       "In response to poor sales results, the company cut spending .")  
-
-  )
diff --git a/contrib/clojure-package/examples/bert/src/bert/infer.clj b/contrib/clojure-package/examples/bert/src/bert/infer.clj
deleted file mode 100644
index 2a08dab36f85..000000000000
--- a/contrib/clojure-package/examples/bert/src/bert/infer.clj
+++ /dev/null
@@ -1,129 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-
-
-(ns bert.infer
-  (:require [bert.util :as bert-util]
-            [clojure.pprint :as pprint]
-            [clojure.string :as string]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.dtype :as dtype]
-            [org.apache.clojure-mxnet.infer :as infer]
-            [org.apache.clojure-mxnet.layout :as layout]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.util :as util]))
-
-(def model-path-prefix "data/static_bert_qa")
-;; epoch number of the model
-(def epoch 2)
-;; the maximum length of the sequence
-(def seq-length 384)
-
-;;; data helpers
-
-(defn post-processing [result tokens]
-  (let [output1 (ndarray/slice-axis result 2 0 1)
-        output2 (ndarray/slice-axis result 2 1 2)
-        ;;; get the formatted logits result
-        start-logits (ndarray/reshape output1 [0 -3])
-        end-logits (ndarray/reshape output2 [0 -3])
-        start-prob (ndarray/softmax start-logits)
-        end-prob (ndarray/softmax end-logits)
-        start-idx (-> (ndarray/argmax start-prob 1)
-                      (ndarray/->vec)
-                      (first))
-        end-idx (-> (ndarray/argmax end-prob 1)
-                    (ndarray/->vec)
-                    (first))]
-    (if (> end-idx start-idx)
-      (subvec tokens start-idx (inc end-idx))
-      (subvec tokens end-idx (inc end-idx)))))
-
-(defn make-predictor [ctx]
-  (let [input-descs [{:name "data0"
-                      :shape [1 seq-length]
-                      :dtype dtype/FLOAT32
-                      :layout layout/NT}
-                     {:name "data1"
-                      :shape [1 seq-length]
-                      :dtype dtype/FLOAT32
-                      :layout layout/NT}
-                     {:name "data2"
-                      :shape [1]
-                      :dtype dtype/FLOAT32
-                      :layout layout/N}]
-        factory (infer/model-factory model-path-prefix input-descs)]
-    (infer/create-predictor
-     factory
-     {:contexts [ctx]
-      :epoch epoch})))
-
-(defn pre-processing [ctx idx->token token->idx qa-map]
-  (let [{:keys [input-question input-answer ground-truth-answers]} qa-map
-       ;;; pre-processing tokenize sentence
-        token-q (bert-util/tokenize (string/lower-case input-question))
-        token-a (bert-util/tokenize (string/lower-case input-answer))
-        valid-length (+ (count token-q) (count token-a))
-        ;;; generate token types [0000...1111...0000]
-        qa-embedded (into (bert-util/pad [] 0 (count token-q))
-                          (bert-util/pad [] 1 (count token-a)))
-        token-types (bert-util/pad qa-embedded 0 seq-length)
-        ;;; make BERT pre-processing standard
-        token-a (conj token-a "[SEP]")
-        token-q (into [] (concat ["[CLS]"] token-q ["[SEP]"] token-a))
-        tokens (bert-util/pad token-q "[PAD]" seq-length)
-        ;;; pre-processing - token to index translation
-
-        indexes (bert-util/tokens->idxs token->idx tokens)]
-    {:input-batch [(ndarray/array indexes [1 seq-length] {:context ctx})
-                   (ndarray/array token-types [1 seq-length] {:context ctx})
-                   (ndarray/array [valid-length] [1] {:context ctx})]
-     :tokens tokens
-     :qa-map qa-map}))
-
-(defn infer
-  ([] (infer (context/default-context)))
-  ([ctx]
-   (let [predictor (make-predictor ctx)
-         {:keys [idx->token token->idx]} (bert-util/get-vocab)
-        ;;; samples taken from https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/
-         question-answers (clojure.edn/read-string (slurp "squad-samples.edn"))]
-     (doseq [qa-map question-answers]
-       (let [{:keys [input-batch tokens qa-map]} (pre-processing ctx idx->token token->idx qa-map)
-             result (first (infer/predict-with-ndarray predictor input-batch))
-             answer (post-processing result tokens)]
-         (println "===============================")
-         (println "      Question Answer Data")
-         (pprint/pprint qa-map)
-         (println)
-         (println "  Predicted Answer: " answer)
-         (println "==============================="))))))
-
-(defn -main [& args]
-  (let [[dev] args]
-    (if (= dev ":gpu")
-      (infer (context/gpu))
-      (infer (context/cpu)))))
-
-(comment
-
-  (infer)
-
-  (infer (context/gpu))
-
-  )
diff --git a/contrib/clojure-package/examples/bert/src/bert/util.clj b/contrib/clojure-package/examples/bert/src/bert/util.clj
deleted file mode 100644
index 061e12b4e8de..000000000000
--- a/contrib/clojure-package/examples/bert/src/bert/util.clj
+++ /dev/null
@@ -1,52 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns bert.util
-  (:require [clojure.java.io :as io]
-            [clojure.string :as string]
-            [cheshire.core :as json]))
-
-(defn break-out-punctuation [s str-match]
-  (->> (string/split (str s "<punc>") (re-pattern (str "\\" str-match)))
-       (map #(string/replace % "<punc>" str-match))))
-
-(defn break-out-punctuations [s]
-  (if-let [target-char (first (re-seq #"[.,?!]" s))]
-    (break-out-punctuation s target-char)
-    [s]))
-
-(defn tokenize [s]
-  (->> (string/split s #"\s+")
-       (mapcat break-out-punctuations)
-       (into [])))
-
-(defn pad [tokens pad-item num]
-  (if (>= (count tokens) num)
-    tokens
-    (into tokens (repeat (- num (count tokens)) pad-item))))
-
-(defn get-vocab []
-  (let [vocab (json/parse-stream (io/reader "data/vocab.json"))]
-    {:idx->token (get vocab "idx_to_token")
-     :token->idx (get vocab "token_to_idx")}))
-
-(defn tokens->idxs [token->idx tokens]
-  (let [unk-idx (get token->idx "[UNK]")]
-    (mapv #(get token->idx % unk-idx) tokens)))
-
-(defn idxs->tokens [idx->token idxs]
-  (mapv #(get idx->token %) idxs))
diff --git a/contrib/clojure-package/examples/bert/test/bert/bert_sentence_classification_test.clj b/contrib/clojure-package/examples/bert/test/bert/bert_sentence_classification_test.clj
deleted file mode 100644
index c26301e34fe6..000000000000
--- a/contrib/clojure-package/examples/bert/test/bert/bert_sentence_classification_test.clj
+++ /dev/null
@@ -1,104 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-
-(ns bert.bert-sentence-classification-test
-  (:require [bert.bert-sentence-classification :refer :all]
-            [clojure-csv.core :as csv]
-            [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [clojure.test :refer :all]
-            [org.apache.clojure-mxnet.callback :as callback]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.dtype :as dtype]
-            [org.apache.clojure-mxnet.eval-metric :as eval-metric]
-            [org.apache.clojure-mxnet.infer :as infer]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.layout :as layout]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.optimizer :as optimizer]
-            [org.apache.clojure-mxnet.module :as m]))
-
-(def model-dir "data/")
-
-(def test-prefix "test-fine-tuning-bert-sentence-pairs")
-
-(when-not (.exists (io/file (str model-dir "static_bert_qa-0002.params")))
-  (println "Downloading bert qa data")
-  (sh "./get_bert_data.sh"))
-
-(defn get-slim-raw-data []
-  (take 32 (csv/parse-csv (slurp "data/dev.tsv") :delimiter \tab)))
-
-(deftest train-test
-  (with-redefs [get-raw-data get-slim-raw-data]
-    (let [dev (context/default-context)
-          num-epoch 1
-          bert-base (m/load-checkpoint {:prefix model-path-prefix :epoch 0})
-          model-sym (fine-tune-model (m/symbol bert-base) {:num-classes 2 :dropout 0.1})
-          {:keys [data0s data1s data2s labels train-num]} (prepare-data (get-raw-data))
-          batch-size 32
-          data-desc0 (mx-io/data-desc {:name "data0"
-                                       :shape [train-num seq-length]
-                                       :dtype dtype/FLOAT32
-                                       :layout layout/NT})
-          data-desc1 (mx-io/data-desc {:name "data1"
-                                       :shape [train-num seq-length]
-                                       :dtype dtype/FLOAT32
-                                       :layout layout/NT})
-          data-desc2 (mx-io/data-desc {:name "data2"
-                                       :shape [train-num]
-                                       :dtype dtype/FLOAT32
-                                       :layout layout/N})
-          label-desc (mx-io/data-desc {:name "softmax_label"
-                                       :shape [train-num]
-                                       :dtype dtype/FLOAT32
-                                       :layout layout/N})
-          train-data  (mx-io/ndarray-iter {data-desc0 (ndarray/array data0s [train-num seq-length]
-                                                                     {:ctx dev})
-                                           data-desc1 (ndarray/array data1s [train-num seq-length]
-                                                                     {:ctx dev})
-                                           data-desc2 (ndarray/array data2s [train-num]
-                                                                     {:ctx dev})}
-                                          {:label {label-desc (ndarray/array labels [train-num]
-                                                                             {:ctx dev})}
-                                           :data-batch-size batch-size})
-          model (m/module model-sym {:contexts [dev]
-                                     :data-names ["data0" "data1" "data2"]})]
-      (m/fit model {:train-data train-data  :num-epoch num-epoch
-                    :fit-params (m/fit-params {:allow-missing true
-                                               :arg-params (m/arg-params bert-base)
-                                               :aux-params (m/aux-params bert-base)
-                                               :optimizer (optimizer/adam {:learning-rate 5e-6 :episilon 1e-9})
-                                               :batch-end-callback (callback/speedometer batch-size 1)})})
-      (m/save-checkpoint model {:prefix test-prefix :epoch num-epoch})
-      (testing "accuracy"
-        (is (< 0.5 (last (m/score model {:eval-data train-data :eval-metric (eval-metric/accuracy)})))))
-      (testing "prediction"
-        (let [test-predictor (infer/create-predictor (infer/model-factory test-prefix
-                                                                          [{:name "data0" :shape [1 seq-length] :dtype dtype/FLOAT32 :layout layout/NT}
-                                                                           {:name "data1" :shape [1 seq-length] :dtype dtype/FLOAT32 :layout layout/NT}
-                                                                           {:name "data2" :shape [1]            :dtype dtype/FLOAT32 :layout layout/N}])
-                                                     {:epoch num-epoch})
-              prediction (predict-equivalence test-predictor
-                                              "The company cut spending to compensate for weak sales ."
-                                              "In response to poor sales results, the company cut spending .")]
-          ;; We can't say much about how the model will find this prediction, so we test only the prediction's shape.
-          (is (vector? prediction))
-          (is (number? (first prediction)))
-          (is (number? (second prediction)))
-          (is (= 2 (count prediction))))))))
diff --git a/contrib/clojure-package/examples/bert/test/bert/infer_test.clj b/contrib/clojure-package/examples/bert/test/bert/infer_test.clj
deleted file mode 100644
index 48ee3a89b177..000000000000
--- a/contrib/clojure-package/examples/bert/test/bert/infer_test.clj
+++ /dev/null
@@ -1,43 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-
-(ns bert.infer-test
-  (:require [bert.infer :refer :all]
-            [bert.util :as util]
-            [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [clojure.test :refer :all]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.infer :as infer]))
-
-(def model-dir "data/")
-
-(when-not (.exists (io/file (str model-dir "static_bert_qa-0002.params")))
-  (println "Downloading bert qa data")
-  (sh "./get_bert_data.sh"))
-
-(deftest infer-test
-  (let [ctx (context/default-context)
-        predictor (make-predictor ctx)
-        {:keys [idx->token token->idx]} (util/get-vocab)
-        ;;; samples taken from https://rajpurkar.github.io/SQuAD-explorer/explore/v2.0/dev/
-        question-answers (clojure.edn/read-string (slurp "squad-samples.edn"))]
-    (let [qa-map (last question-answers)
-          {:keys [input-batch tokens qa-map]} (pre-processing ctx idx->token token->idx qa-map)
-          result (first (infer/predict-with-ndarray predictor input-batch))]
-      (is (= ["rich" "hickey"] (post-processing result tokens))))))
diff --git a/contrib/clojure-package/examples/captcha/.gitignore b/contrib/clojure-package/examples/captcha/.gitignore
deleted file mode 100644
index e1569bd89020..000000000000
--- a/contrib/clojure-package/examples/captcha/.gitignore
+++ /dev/null
@@ -1,3 +0,0 @@
-/.lein-*
-/.nrepl-port
-images/*
diff --git a/contrib/clojure-package/examples/captcha/README.md b/contrib/clojure-package/examples/captcha/README.md
deleted file mode 100644
index be71e07b2120..000000000000
--- a/contrib/clojure-package/examples/captcha/README.md
+++ /dev/null
@@ -1,78 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# Captcha
-
-This is the clojure version of [captcha recognition](https://github.com/xlvector/learning-dl/tree/master/mxnet/ocr)
-example by xlvector and mirrors the R captcha example. It can be used as an
-example of multi-label training. For the following captcha example, we consider it as an
-image with 4 labels and train a CNN over the data set.
-
-![captcha example](captcha_example.png)
-
-## Installation
-
-Before you run this example, make sure that you have the clojure package
-installed. In the main clojure package directory, do `lein install`.
-Then you can run `lein install` in this directory.
-
-## Usage
-
-### Training
-
-First the OCR model needs to be trained based on [labeled data](https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/captcha_example.zip).
-The training can be started using the following:
-```
-$ lein train [:cpu|:gpu] [num-devices]
-```
-This downloads the training/evaluation data using the `get_data.sh` script
-before starting training.
-
-It is possible that you will encounter some out-of-memory issues while training using :gpu on Ubuntu
-linux (18.04). However, the command `lein train` (training on one CPU) may resolve the issue.
-
-The training runs for 10 iterations by default and saves the model with the
-prefix `ocr-`. The model achieved an exact match accuracy of ~0.954 and
-~0.628 on training and validation data respectively.
-
-### Inference
-
-Once the model has been saved, it can be used for prediction. This can be done
-by running:
-```
-$ lein infer
-INFO  MXNetJVM: Try loading mxnet-scala from native path.
-INFO  MXNetJVM: Try loading mxnet-scala-linux-x86_64-gpu from native path.
-INFO  MXNetJVM: Try loading mxnet-scala-linux-x86_64-cpu from native path.
-WARN  MXNetJVM: MXNet Scala native library not found in path. Copying native library from the archive. Consider installing the library somewhere in the path (for Windows: PATH, for Linux: LD_LIBRARY_PATH), or specifying by Java cmd option -Djava.library.path=[lib path].
-WARN  org.apache.mxnet.DataDesc: Found Undefined Layout, will use default index 0 for batch axis
-INFO  org.apache.mxnet.infer.Predictor: Latency increased due to batchSize mismatch 8 vs 1
-WARN  org.apache.mxnet.DataDesc: Found Undefined Layout, will use default index 0 for batch axis
-WARN  org.apache.mxnet.DataDesc: Found Undefined Layout, will use default index 0 for batch axis
-CAPTCHA output: 6643
-INFO  org.apache.mxnet.util.NativeLibraryLoader: Deleting /tmp/mxnet6045308279291774865/libmxnet.so
-INFO  org.apache.mxnet.util.NativeLibraryLoader: Deleting /tmp/mxnet6045308279291774865/mxnet-scala
-INFO  org.apache.mxnet.util.NativeLibraryLoader: Deleting /tmp/mxnet6045308279291774865
-```
-The model runs on `captcha_example.png` by default.
-
-It can be run on other generated captcha images as well. The script
-`gen_captcha.py` generates random captcha images for length 4.
-Before running the python script, you will need to install the [captcha](https://pypi.org/project/captcha/)
-library using `pip3 install --user captcha`. The captcha images are generated
-in the `images/` folder and we can run the prediction using
-`lein infer images/7534.png`.
diff --git a/contrib/clojure-package/examples/captcha/captcha_example.png b/contrib/clojure-package/examples/captcha/captcha_example.png
deleted file mode 100644
index 09b84f7190fa..000000000000
Binary files a/contrib/clojure-package/examples/captcha/captcha_example.png and /dev/null differ
diff --git a/contrib/clojure-package/examples/captcha/gen_captcha.py b/contrib/clojure-package/examples/captcha/gen_captcha.py
deleted file mode 100644
index 43e0d26fb961..000000000000
--- a/contrib/clojure-package/examples/captcha/gen_captcha.py
+++ /dev/null
@@ -1,40 +0,0 @@
-#!/usr/bin/env python3
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-from captcha.image import ImageCaptcha
-import os
-import random
-
-length = 4
-width = 160
-height = 60
-IMAGE_DIR = "images"
-
-
-def random_text():
-    return ''.join(str(random.randint(0, 9))
-                   for _ in range(length))
-
-
-if __name__ == '__main__':
-    image = ImageCaptcha(width=width, height=height)
-    captcha_text = random_text()
-    if not os.path.exists(IMAGE_DIR):
-        os.makedirs(IMAGE_DIR)
-    image.write(captcha_text, os.path.join(IMAGE_DIR, captcha_text + ".png"))
diff --git a/contrib/clojure-package/examples/captcha/get_data.sh b/contrib/clojure-package/examples/captcha/get_data.sh
deleted file mode 100755
index baa7f9eb818f..000000000000
--- a/contrib/clojure-package/examples/captcha/get_data.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -evx
-
-EXAMPLE_ROOT=$(cd "$(dirname $0)"; pwd)
-
-data_path=$EXAMPLE_ROOT
-
-if [ ! -f "$data_path/captcha_example.zip" ]; then
-  wget https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/captcha_example.zip -P $data_path
-fi
-
-if [ ! -f "$data_path/captcha_example/captcha_train.rec" ]; then
-  unzip $data_path/captcha_example.zip -d $data_path
-fi
diff --git a/contrib/clojure-package/examples/captcha/project.clj b/contrib/clojure-package/examples/captcha/project.clj
deleted file mode 100644
index 7bc862c29e3d..000000000000
--- a/contrib/clojure-package/examples/captcha/project.clj
+++ /dev/null
@@ -1,28 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(defproject captcha "0.1.0-SNAPSHOT"
-  :description "Captcha recognition via multi-label classification"
-  :plugins [[lein-cljfmt "0.5.7"]]
-  :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "2.0.0-SNAPSHOT"]]
-  :main ^:skip-aot captcha.train-ocr
-  :profiles {:train {:main captcha.train-ocr}
-             :infer {:main captcha.infer-ocr}
-             :uberjar {:aot :all}}
-  :aliases {"train" ["with-profile" "train" "run"]
-            "infer" ["with-profile" "infer" "run"]})
diff --git a/contrib/clojure-package/examples/captcha/src/captcha/consts.clj b/contrib/clojure-package/examples/captcha/src/captcha/consts.clj
deleted file mode 100644
index 318e0d806873..000000000000
--- a/contrib/clojure-package/examples/captcha/src/captcha/consts.clj
+++ /dev/null
@@ -1,27 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns captcha.consts)
-
-(def batch-size 8)
-(def channels 3)
-(def height 30)
-(def width 80)
-(def data-shape [channels height width])
-(def num-labels 10)
-(def label-width 4)
-(def model-prefix "ocr")
diff --git a/contrib/clojure-package/examples/captcha/src/captcha/infer_ocr.clj b/contrib/clojure-package/examples/captcha/src/captcha/infer_ocr.clj
deleted file mode 100644
index f6a648e9867b..000000000000
--- a/contrib/clojure-package/examples/captcha/src/captcha/infer_ocr.clj
+++ /dev/null
@@ -1,56 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns captcha.infer-ocr
-  (:require [captcha.consts :refer :all]
-            [org.apache.clojure-mxnet.dtype :as dtype]
-            [org.apache.clojure-mxnet.infer :as infer]
-            [org.apache.clojure-mxnet.layout :as layout]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]))
-
-(defn create-predictor
-  []
-  (let [data-desc {:name "data"
-                   :shape [batch-size channels height width]
-                   :layout layout/NCHW
-                   :dtype dtype/FLOAT32}
-        label-desc {:name "label"
-                    :shape [batch-size label-width]
-                    :layout layout/NT
-                    :dtype dtype/FLOAT32}
-        factory (infer/model-factory model-prefix
-                                     [data-desc label-desc])]
-    (infer/create-predictor factory)))
-
-(defn -main
-  [& args]
-  (let [[filename] args
-        image-fname (or filename "captcha_example.png")
-        image-ndarray (-> image-fname
-                          infer/load-image-from-file
-                          (infer/reshape-image width height)
-                          (infer/buffered-image-to-pixels [channels height width])
-                          (ndarray/expand-dims 0))
-        label-ndarray (ndarray/zeros [1 label-width])
-        predictor (create-predictor)
-        predictions (-> (infer/predict-with-ndarray
-                         predictor
-                         [image-ndarray label-ndarray])
-                        first
-                        (ndarray/argmax 1)
-                        ndarray/->vec)]
-    (println "CAPTCHA output:" (apply str (mapv int predictions)))))
diff --git a/contrib/clojure-package/examples/captcha/src/captcha/train_ocr.clj b/contrib/clojure-package/examples/captcha/src/captcha/train_ocr.clj
deleted file mode 100644
index 91ec2fff3af7..000000000000
--- a/contrib/clojure-package/examples/captcha/src/captcha/train_ocr.clj
+++ /dev/null
@@ -1,156 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns captcha.train-ocr
-  (:require [captcha.consts :refer :all]
-            [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [org.apache.clojure-mxnet.callback :as callback]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.eval-metric :as eval-metric]
-            [org.apache.clojure-mxnet.initializer :as initializer]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.optimizer :as optimizer]
-            [org.apache.clojure-mxnet.symbol :as sym])
-  (:gen-class))
-
-(when-not (.exists (io/file "captcha_example/captcha_train.lst"))
-  (sh "./get_data.sh"))
-
-(defonce train-data
-  (mx-io/image-record-iter {:path-imgrec "captcha_example/captcha_train.rec"
-                            :path-imglist "captcha_example/captcha_train.lst"
-                            :batch-size batch-size
-                            :label-width label-width
-                            :data-shape data-shape
-                            :shuffle true
-                            :seed 42}))
-
-(defonce eval-data
-  (mx-io/image-record-iter {:path-imgrec "captcha_example/captcha_test.rec"
-                            :path-imglist "captcha_example/captcha_test.lst"
-                            :batch-size batch-size
-                            :label-width label-width
-                            :data-shape data-shape}))
-
-(defn accuracy
-  [label pred & {:keys [by-character]
-                 :or {by-character false} :as opts}]
-  (let [[nr nc] (ndarray/shape-vec label)
-        pred-context (ndarray/context pred)
-        label-t (-> label
-                    ndarray/transpose
-                    (ndarray/reshape [-1])
-                    (ndarray/as-in-context pred-context))
-        pred-label (ndarray/argmax pred 1)
-        matches (ndarray/equal label-t pred-label)
-        [digit-matches] (-> matches
-                            ndarray/sum
-                            ndarray/->vec)
-        [complete-matches] (-> matches
-                               (ndarray/reshape [nc nr])
-                               (ndarray/sum 0)
-                               (ndarray/equal label-width)
-                               ndarray/sum
-                               ndarray/->vec)]
-    (if by-character
-      (float (/ digit-matches nr nc))
-      (float (/ complete-matches nr)))))
-
-(defn get-data-symbol
-  []
-  (let [data (sym/variable "data")
-        ;; normalize the input pixels
-        scaled (sym/div (sym/- data 127) 128)
-
-        conv1 (sym/convolution {:data scaled :kernel [5 5] :num-filter 32})
-        pool1 (sym/pooling {:data conv1 :pool-type "max" :kernel [2 2] :stride [1 1]})
-        relu1 (sym/activation {:data pool1 :act-type "relu"})
-
-        conv2 (sym/convolution {:data relu1 :kernel [5 5] :num-filter 32})
-        pool2 (sym/pooling {:data conv2 :pool-type "avg" :kernel [2 2] :stride [1 1]})
-        relu2 (sym/activation {:data pool2 :act-type "relu"})
-
-        conv3 (sym/convolution {:data relu2 :kernel [3 3] :num-filter 32})
-        pool3 (sym/pooling {:data conv3 :pool-type "avg" :kernel [2 2] :stride [1 1]})
-        relu3 (sym/activation {:data pool3 :act-type "relu"})
-
-        conv4 (sym/convolution {:data relu3 :kernel [3 3] :num-filter 32})
-        pool4 (sym/pooling {:data conv4 :pool-type "avg" :kernel [2 2] :stride [1 1]})
-        relu4 (sym/activation {:data pool4 :act-type "relu"})
-
-        flattened (sym/flatten {:data relu4})
-        fc1 (sym/fully-connected {:data flattened :num-hidden 256})
-        fc21 (sym/fully-connected {:data fc1 :num-hidden num-labels})
-        fc22 (sym/fully-connected {:data fc1 :num-hidden num-labels})
-        fc23 (sym/fully-connected {:data fc1 :num-hidden num-labels})
-        fc24 (sym/fully-connected {:data fc1 :num-hidden num-labels})]
-    (sym/concat "concat" nil [fc21 fc22 fc23 fc24] {:dim 0})))
-
-(defn get-label-symbol
-  []
-  (as-> (sym/variable "label") label
-    (sym/transpose {:data label})
-    (sym/reshape {:data label :shape [-1]})))
-
-(defn create-captcha-net
-  []
-  (let [scores (get-data-symbol)
-        labels (get-label-symbol)]
-    (sym/softmax-output {:data scores :label labels})))
-
-(def optimizer
-  (optimizer/adam
-   {:learning-rate 0.0002
-    :wd 0.00001
-    :clip-gradient 10}))
-
-(defn train-ocr
-  [devs]
-  (println "Starting the captcha training ...")
-  (let [model (m/module
-               (create-captcha-net)
-               {:data-names ["data"] :label-names ["label"]
-                :contexts devs})]
-    (m/fit model {:train-data train-data
-                  :eval-data eval-data
-                  :num-epoch 10
-                  :fit-params (m/fit-params
-                               {:kvstore "local"
-                                :batch-end-callback
-                                (callback/speedometer batch-size 100)
-                                :initializer
-                                (initializer/xavier {:factor-type "in"
-                                                     :magnitude 2.34})
-                                :optimizer optimizer
-                                :eval-metric (eval-metric/custom-metric
-                                              #(accuracy %1 %2)
-                                              "accuracy")})})
-    (println "Finished the fit")
-    model))
-
-(defn -main
-  [& args]
-  (let [[dev dev-num] args
-        num-devices (Integer/parseInt (or dev-num "1"))
-        devs (if (= dev ":gpu")
-               (mapv #(context/gpu %) (range num-devices))
-               (mapv #(context/cpu %) (range num-devices)))
-        model (train-ocr devs)]
-    (m/save-checkpoint model {:prefix model-prefix :epoch 0})))
diff --git a/contrib/clojure-package/examples/captcha/test/captcha/train_ocr_test.clj b/contrib/clojure-package/examples/captcha/test/captcha/train_ocr_test.clj
deleted file mode 100644
index ab785f7fedf2..000000000000
--- a/contrib/clojure-package/examples/captcha/test/captcha/train_ocr_test.clj
+++ /dev/null
@@ -1,119 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns captcha.train-ocr-test
-  (:require [clojure.test :refer :all]
-            [captcha.consts :refer :all]
-            [captcha.train-ocr :refer :all]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.shape :as shape]
-            [org.apache.clojure-mxnet.util :as util]))
-
-(deftest test-consts
-  (is (= 8 batch-size))
-  (is (= [3 30 80] data-shape))
-  (is (= 4 label-width))
-  (is (= 10 num-labels)))
-
-(deftest test-labeled-data
-  (let [train-batch (mx-io/next train-data)
-        eval-batch (mx-io/next eval-data)
-        allowed-labels (into #{} (map float (range 10)))]
-    (is (= 8 (-> train-batch mx-io/batch-index count)))
-    (is (= 8 (-> eval-batch mx-io/batch-index count)))
-    (is (= [8 3 30 80] (-> train-batch
-                           mx-io/batch-data
-                           first
-                           ndarray/shape-vec)))
-    (is (= [8 3 30 80] (-> eval-batch
-                           mx-io/batch-data
-                           first
-                           ndarray/shape-vec)))
-    (is (every? #(<= 0 % 255) (-> train-batch
-                                  mx-io/batch-data
-                                  first
-                                  ndarray/->vec)))
-    (is (every? #(<= 0 % 255) (-> eval-batch
-                                  mx-io/batch-data
-                                  first
-                                  ndarray/->vec)))
-    (is (= [8 4] (-> train-batch
-                     mx-io/batch-label
-                     first
-                     ndarray/shape-vec)))
-    (is (= [8 4] (-> eval-batch
-                     mx-io/batch-label
-                     first
-                     ndarray/shape-vec)))
-    (is (every? allowed-labels (-> train-batch
-                                   mx-io/batch-label
-                                   first
-                                   ndarray/->vec)))
-    (is (every? allowed-labels (-> eval-batch
-                                   mx-io/batch-label
-                                   first
-                                   ndarray/->vec)))))
-
-(deftest test-model
-  (let [batch (mx-io/next train-data)
-        model (m/module (create-captcha-net)
-                        {:data-names ["data"] :label-names ["label"]})
-        _ (m/bind model
-                  {:data-shapes (mx-io/provide-data-desc train-data)
-                   :label-shapes (mx-io/provide-label-desc train-data)})
-        _ (m/init-params model)
-        _ (m/forward-backward model batch)
-        output-shapes (-> model
-                          m/output-shapes
-                          util/coerce-return-recursive)
-        outputs (-> model
-                    m/outputs-merged
-                    first)
-        grads (->> model m/grad-arrays (map first))]
-    (is (= [["softmaxoutput0_output" (shape/->shape [8 10])]]
-           output-shapes))
-    (is (= [32 10] (-> outputs ndarray/shape-vec)))
-    (is (every? #(<= 0.0 % 1.0) (-> outputs ndarray/->vec)))
-    (is (= [[32 3 5 5] [32]   ; convolution1 weights+bias
-            [32 32 5 5] [32]  ; convolution2 weights+bias
-            [32 32 3 3] [32]  ; convolution3 weights+bias
-            [32 32 3 3] [32]  ; convolution4 weights+bias
-            [256 28672] [256] ; fully-connected1 weights+bias
-            [10 256] [10]     ; 1st label scores
-            [10 256] [10]     ; 2nd label scores
-            [10 256] [10]     ; 3rd label scores
-            [10 256] [10]]    ; 4th label scores
-           (map ndarray/shape-vec grads)))))
-
-(deftest test-accuracy
-  (let [labels (ndarray/array [1 2 3 4,
-                               5 6 7 8]
-                              [2 4])
-        pred-labels (ndarray/array [1 0,
-                                    2 6,
-                                    3 0,
-                                    4 8]
-                                   [8])
-        preds (ndarray/one-hot pred-labels 10)]
-    (is (float? (accuracy labels preds)))
-    (is (float? (accuracy labels preds :by-character false)))
-    (is (float? (accuracy labels preds :by-character true)))
-    (is (= 0.5 (accuracy labels preds)))
-    (is (= 0.5 (accuracy labels preds :by-character false)))
-    (is (= 0.75 (accuracy labels preds :by-character true)))))
diff --git a/contrib/clojure-package/examples/cnn-text-classification/.gitignore b/contrib/clojure-package/examples/cnn-text-classification/.gitignore
deleted file mode 100644
index c53038ec0e3d..000000000000
--- a/contrib/clojure-package/examples/cnn-text-classification/.gitignore
+++ /dev/null
@@ -1,11 +0,0 @@
-/target
-/classes
-/checkouts
-pom.xml
-pom.xml.asc
-*.jar
-*.class
-/.lein-*
-/.nrepl-port
-.hgignore
-.hg/
diff --git a/contrib/clojure-package/examples/cnn-text-classification/README.md b/contrib/clojure-package/examples/cnn-text-classification/README.md
deleted file mode 100644
index 152ee4f10189..000000000000
--- a/contrib/clojure-package/examples/cnn-text-classification/README.md
+++ /dev/null
@@ -1,97 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# cnn-text-classification
-
-An example of text classification using CNN
-
-To use you must download the MR polarity dataset and put it in the path specified in the mr-dataset-path
-The dataset can be obtained here: [CNN_sentence](https://github.com/yoonkim/CNN_sentence). The two files `rt-polarity.neg`
-and `rt-polarity.pos` must be put in a directory. For example, `data/mr-data/rt-polarity.neg`.
-
-You also must download the glove word embeddings. The suggested one to use is the smaller 50 dimension one
-`glove.6B.50d.txt` which is contained in the download file here: [GloVe](https://nlp.stanford.edu/projects/glove/)
-
-
-## Installation
-
-Before you run this example, make sure that you have the clojure package installed.
-In the main clojure package directory, do `lein install`. Then you can run
-`lein install` in this directory.
-
-## Usage
-
-You can run through the repl with
-`(train-convnet {:devs [(context/default-context)] :embedding-size 50 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :glove})`
-or
-`JVM_OPTS="-Xmx1g" lein run` (cpu)
-
-You can control the devices you run on by doing:
-
-`lein run :cpu 2` - This will run on 2 cpu devices
-`lein run :gpu 1` - This will run on 1 gpu device
-`lein run :gpu 2` - This will run on 2 gpu devices
-
-
-The max-examples only loads 1000 each of the dataset to keep the time and memory down. To run all the examples,
-change the main to be (train-convnet {:embedding-size 50 :batch-size 100 :test-size 1000 :num-epoch 10 :pretrained-embedding :glove})
-
-and then run
-
-- `lein uberjar`
-- `java -Xms1024m -Xmx2048m -jar target/cnn-text-classification-0.1.0-SNAPSHOT-standalone.jar`
-
-## Usage with fastText
-
-Using fastText instead of glove is fairly straightforward, as the pretrained embedding format is very similar.
-
-Download the 'Simple English' pretrained wiki word vectors (text) from the fastText
-[site](https://fasttext.cc/docs/en/pretrained-vectors.html) and place them in the
-`data/fasttext` directory. Alternatively just run `./get_fasttext_data.sh`.
-
-Then you can run training on a subset of examples through the repl using:
-```
-(train-convnet {:devs [(context/default-context)] :embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :fasttext})
-```
-
-Expect a validation accuracy of `~0.67` with the above parameters.
-
-## Usage with word2vec
-
-You can also use word2vec embeddings in order to train the text classification model.
-Before training, you will need to download [GoogleNews-vectors-negative300.bin](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing) first.
-Once you've downloaded the embeddings (which are in a gzipped format),
-you'll need to unzip them and place them in the `contrib/clojure-package/data` directory.
-
-Then you can run training on a subset of examples through the repl using:
-```
-(train-convnet {:devs [(context/default-context)] :embedding-size 300 :batch-size 100 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :word2vec})
-```
-Note that loading word2vec embeddings consumes memory and takes some time.
-
-You can also train them using `JVM_OPTS="-Xmx8g" lein run` once you've modified
-the parameters to `train-convnet` (see above) in `src/cnn_text_classification/classifier.clj`.
-In order to run training with word2vec on the complete data set, you will need to run:
-```
-(train-convnet {:devs [(context/default-context)] :embedding-size 300 :batch-size 100 :test-size 1000 :num-epoch 10 :pretrained-embedding :word2vec})
-```
-You should be able to achieve an accuracy of `~0.78` using the parameters above.
-
-## Usage with learned embeddings
-
-Lastly, similar to the python CNN text classification example, you can learn the embeddings based on training data.
-This can be achieved by setting `:pretrained-embedding nil` (or omitting that parameter altogether).
diff --git a/contrib/clojure-package/examples/cnn-text-classification/get_data.sh b/contrib/clojure-package/examples/cnn-text-classification/get_data.sh
deleted file mode 100755
index 7bbd9ce72142..000000000000
--- a/contrib/clojure-package/examples/cnn-text-classification/get_data.sh
+++ /dev/null
@@ -1,31 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -evx
-
-mkdir -p data/mr-data
-cd data/mr-data
-wget https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.neg
-wget https://raw.githubusercontent.com/yoonkim/CNN_sentence/master/rt-polarity.pos
-cd ../..
-mkdir -p data/glove
-cd data/glove
-wget http://nlp.stanford.edu/data/glove.6B.zip
-unzip *.zip
-cd ../..
diff --git a/contrib/clojure-package/examples/cnn-text-classification/get_fasttext_data.sh b/contrib/clojure-package/examples/cnn-text-classification/get_fasttext_data.sh
deleted file mode 100755
index 2bfe96659402..000000000000
--- a/contrib/clojure-package/examples/cnn-text-classification/get_fasttext_data.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -evx
-
-mkdir -p data/fasttext
-cd data/fasttext
-wget https://dl.fbaipublicfiles.com/fasttext/vectors-wiki/wiki.simple.vec
diff --git a/contrib/clojure-package/examples/cnn-text-classification/project.clj b/contrib/clojure-package/examples/cnn-text-classification/project.clj
deleted file mode 100644
index 5235027a71fb..000000000000
--- a/contrib/clojure-package/examples/cnn-text-classification/project.clj
+++ /dev/null
@@ -1,24 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(defproject cnn-text-classification "0.1.0-SNAPSHOT"
-  :description "CNN text classification with MXNet"
-  :plugins [[lein-cljfmt "0.5.7"]]
-  :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "2.0.0-SNAPSHOT"]]
-  :pedantic? :skip
-  :main cnn-text-classification.classifier)
diff --git a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/classifier.clj b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/classifier.clj
deleted file mode 100644
index 3c0288c9c343..000000000000
--- a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/classifier.clj
+++ /dev/null
@@ -1,135 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns cnn-text-classification.classifier
-  (:require [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [cnn-text-classification.data-helper :as data-helper]
-            [org.apache.clojure-mxnet.eval-metric :as eval-metric]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.optimizer :as optimizer]
-            [org.apache.clojure-mxnet.symbol :as sym]
-            [org.apache.clojure-mxnet.context :as context])
-  (:gen-class))
-
-(def data-dir "data/")
-(def mr-dataset-path "data/mr-data") ;; the MR polarity dataset path
-(def num-filter 100)
-(def num-label 2)
-(def dropout 0.5)
-
-(when-not (.exists (io/file (str data-dir)))
-  (do (println "Retrieving data for cnn text classification...") (sh "./get_data.sh")))
-
-(defn shuffle-data [test-num {:keys [data label sentence-count sentence-size vocab-size embedding-size pretrained-embedding]}]
-  (println "Shuffling the data and splitting into training and test sets")
-  (println {:sentence-count sentence-count
-            :sentence-size sentence-size
-            :vocab-size vocab-size
-            :embedding-size embedding-size
-            :pretrained-embedding pretrained-embedding})
-  (let [shuffled (shuffle (map #(vector %1 %2) data label))
-        train-num (- (count shuffled) test-num)
-        training (into [] (take train-num shuffled))
-        test (into [] (drop train-num shuffled))
-        ;; has to be channel x y
-        train-data-shape (if pretrained-embedding
-                           [train-num 1 sentence-size embedding-size]
-                           [train-num 1 sentence-size])
-        ;; has to be channel x y
-        test-data-shape (if pretrained-embedding
-                           [test-num 1 sentence-size embedding-size]
-                           [test-num 1 sentence-size])]
-    {:training {:data  (ndarray/array (into [] (flatten (mapv first training)))
-                                      train-data-shape)
-                :label (ndarray/array (into [] (flatten (mapv last  training)))
-                                      [train-num])}
-     :test {:data  (ndarray/array (into [] (flatten (mapv first test)))
-                                  test-data-shape)
-            :label (ndarray/array (into [] (flatten (mapv last  test)))
-                                  [test-num])}}))
-
-(defn get-data-symbol [num-embed sentence-size batch-size vocab-size pretrained-embedding]
-  (if pretrained-embedding
-    (sym/variable "data")
-    (as-> (sym/variable "data") data
-      (sym/embedding "vocab_embed" {:data data :input-dim vocab-size :output-dim num-embed})
-      (sym/reshape {:data data :target-shape [batch-size 1 sentence-size num-embed]}))))
-
-(defn make-filter-layers [{:keys [input-x num-embed sentence-size] :as config}
-                          filter-size]
-  (as-> (sym/convolution {:data input-x
-                          :kernel [filter-size num-embed]
-                          :num-filter num-filter}) data
-    (sym/activation {:data data :act-type "relu"})
-    (sym/pooling {:data data
-                  :pool-type "max"
-                  :kernel [(inc (- sentence-size filter-size)) 1]
-                  :stride [1 1]})))
-
-;;; convnet with multiple filter sizes
-;; from Convolutional Neural Networks for Sentence Classification by Yoon Kim
-(defn get-multi-filter-convnet [num-embed sentence-size batch-size vocab-size pretrained-embedding]
-  (let [filter-list [3 4 5]
-        input-x (get-data-symbol num-embed sentence-size batch-size vocab-size pretrained-embedding)
-        polled-outputs (mapv #(make-filter-layers {:input-x input-x :num-embed num-embed :sentence-size sentence-size} %) filter-list)
-        total-filters (* num-filter (count filter-list))
-        concat (sym/concat "concat" nil polled-outputs {:dim 1})
-        hpool (sym/reshape "hpool" {:data concat :target-shape [batch-size total-filters]})
-        hdrop (if (pos? dropout) (sym/dropout "hdrop" {:data hpool :p dropout}) hpool)
-        fc (sym/fully-connected  "fc1" {:data hdrop :num-hidden num-label})]
-    (sym/softmax-output "softmax" {:data fc})))
-
-(defn train-convnet [{:keys [devs embedding-size batch-size test-size
-                             num-epoch max-examples pretrained-embedding]}]
-  (let [ms-dataset (data-helper/load-ms-with-embeddings mr-dataset-path max-examples embedding-size {:pretrained-embedding pretrained-embedding})
-        sentence-size (:sentence-size ms-dataset)
-        vocab-size (:vocab-size ms-dataset)
-        shuffled (shuffle-data test-size ms-dataset)
-        train-data (mx-io/ndarray-iter [(get-in shuffled [:training :data])]
-                                       {:label [(get-in shuffled [:training :label])]
-                                        :label-name "softmax_label"
-                                        :data-batch-size batch-size
-                                        :last-batch-handle "pad"})
-        test-data (mx-io/ndarray-iter [(get-in shuffled [:test :data])]
-                                      {:label [(get-in  shuffled [:test :label])]
-                                       :label-name "softmax_label"
-                                       :data-batch-size batch-size
-                                       :last-batch-handle "pad"})]
-    (let [mod (m/module (get-multi-filter-convnet embedding-size sentence-size batch-size vocab-size pretrained-embedding) {:contexts devs})]
-      (println "Getting ready to train for " num-epoch " epochs")
-      (println "===========")
-      (m/fit mod {:train-data train-data :eval-data test-data :num-epoch num-epoch
-                  :fit-params (m/fit-params {:optimizer (optimizer/adam)})}))))
-
-(defn -main [& args]
-  (let [[dev dev-num] args
-        devs (if (= dev ":gpu")
-               (mapv #(context/gpu %) (range (Integer/parseInt (or dev-num "1"))))
-               (mapv #(context/cpu %) (range (Integer/parseInt (or dev-num "1")))))]
-  ;;; omit max-examples if you want to run all the examples in the movie review dataset
-    ;; to limit mem consumption set to something like 1000 and adjust test size to 100
-    (println "Running with context devices of" devs)
-    (train-convnet {:devs devs :embedding-size 50 :batch-size 10 :test-size 100 :num-epoch 10 :max-examples 1000 :pretrained-embedding :glove})
-    ;; runs all the examples
-    #_(train-convnet {:embedding-size 50 :batch-size 100 :test-size 1000 :num-epoch 10})))
-
-(comment
-  (train-convnet {:devs devs :embedding-size 50 :batch-size 10 :test-size 100 :num-epoch 10 :max-examples 1000}))
-
diff --git a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj b/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj
deleted file mode 100644
index df132c3167cd..000000000000
--- a/contrib/clojure-package/examples/cnn-text-classification/src/cnn_text_classification/data_helper.clj
+++ /dev/null
@@ -1,222 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns cnn-text-classification.data-helper
-  (:require [clojure.java.io :as io]
-            [clojure.string :as string]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.random :as random])
-  (:import (java.io DataInputStream)
-           (java.nio ByteBuffer ByteOrder))
-  (:gen-class))
-
-(def w2v-file-path "../../data/GoogleNews-vectors-negative300.bin") ;; the word2vec file path
-(def EOS "</s>")  ;; end of sentence word
-
-(defn glove-file-path
-  "Returns the file path to GloVe embedding of the input size"
-  [embedding-size]
-  (format "data/glove/glove.6B.%dd.txt" embedding-size))
-
-(def fasttext-file-path "data/fasttext/wiki.simple.vec")
-
-(defn r-string
-  "Reads a string from the given DataInputStream `dis` until a space or newline is reached."
-  [dis]
-  (loop [b (.readByte dis)
-         bs []]
-    (if (and (not= 32 b) (not= 10 b))
-      (recur (.readByte dis) (conj bs b))
-      (new String (byte-array bs)))))
-
-(defn get-float [bs]
-  (-> (ByteBuffer/wrap bs)
-      (.order ByteOrder/LITTLE_ENDIAN)
-      (.getFloat)))
-
-(defn read-float [is]
-  (let [bs (byte-array 4)]
-    (do (.read is bs)
-        (get-float bs))))
-
-(defn- load-w2v-vectors
-  "Lazily loads the word2vec vectors given a data input stream `dis`,
-  number of words `nwords` and dimensionality `embedding-size`."
-  [dis embedding-size num-vectors]
-  (if (= 0 num-vectors)
-    (list)
-    (let [word (r-string dis)
-          vect (mapv (fn [_] (read-float dis)) (range embedding-size))]
-      (cons [word vect] (lazy-seq (load-w2v-vectors dis embedding-size (dec num-vectors)))))))
-
-(defn load-word2vec-model!
-  "Loads the word2vec model stored in a binary format from the given `path`.
-  By default only the first 100 embeddings are loaded."
-  ([path embedding-size opts]
-   (println "Loading the word2vec model from binary ...")
-   (with-open [bis (io/input-stream path)
-               dis (new DataInputStream bis)]
-     (let [word-size (Integer/parseInt (r-string dis))
-           dim  (Integer/parseInt (r-string dis))
-           {:keys [max-vectors vocab] :or {max-vectors word-size}} opts
-           _  (println "Processing with " {:dim dim :word-size word-size} " loading max vectors " max-vectors)
-           _ (if (not= embedding-size dim)
-               (throw (ex-info "Mismatch in embedding size"
-                       {:input-embedding-size embedding-size
-                        :word2vec-embedding-size dim})))
-           vectors (load-w2v-vectors dis dim max-vectors)
-           word2vec (if vocab
-                      (->> vectors
-                           (filter (fn [[w _]] (contains? vocab w)))
-                           (into {}))
-                      (->> vectors
-                           (take max-vectors)
-                           (into {})))]
-       (println "Finished")
-       {:num-embed dim :word2vec word2vec})))
-  ([path embedding-size]
-   (load-word2vec-model! path embedding-size {:max-vectors 100})))
-
-(defn read-text-embedding-pairs [pairs]
-  (for [^String line pairs
-        :let [fields (.split line " ")]]
-    [(aget fields 0)
-     (mapv #(Float/parseFloat ^String %) (rest fields))]))
-
-(defn load-glove! [glove-file-path]
-  (println "Loading the glove pre-trained word embeddings from " glove-file-path)
-  (->> (io/reader glove-file-path)
-       line-seq
-       read-text-embedding-pairs
-       (into {})))
-
-(def remove-fasttext-metadata rest)
-
-(defn load-fasttext! [fasttext-file-path]
-  (println "Loading the fastText pre-trained word embeddings from " fasttext-file-path)
-  (->> (io/reader fasttext-file-path)
-       line-seq
-       remove-fasttext-metadata
-       read-text-embedding-pairs
-       (into {})))
-
-(defn clean-str [s]
-  (-> s
-      (string/replace #"^A-Za-z0-9(),!?'`]" " ")
-      (string/replace #"'s" " 's")
-      (string/replace #"'ve" " 've")
-      (string/replace #"n't" " n't")
-      (string/replace #"'re" " 're")
-      (string/replace #"'d" " 'd")
-      (string/replace #"'ll" " 'll")
-      (string/replace #"," " , ")
-      (string/replace #"!" " ! ")
-      (string/replace #"\(" " ( ")
-      (string/replace #"\)" " ) ")
-      (string/replace #"\?" " ? ")
-      (string/replace #" {2,}" " ")
-      (string/trim)))
-
-(defn load-mr-data-and-labels
-  "Loads MR polarity data from files, splits the data into words and generates labels. 
-  Returns split sentences and labels."
-  [path max-examples]
-  (println "Loading all the movie reviews from " path)
-  (let [positive-examples (mapv #(string/trim %) (-> (slurp (str path "/rt-polarity.pos"))
-                                                     (string/split #"\n")))
-        negative-examples (mapv #(string/trim %) (-> (slurp (str path "/rt-polarity.neg"))
-                                                     (string/split #"\n")))
-        positive-examples (into [] (if max-examples (take max-examples positive-examples) positive-examples))
-        negative-examples (into [] (if max-examples (take max-examples negative-examples) negative-examples))
-        ;; split by words
-        x-text (->> (into positive-examples negative-examples)
-                    (mapv clean-str)
-                    (mapv #(string/split % #" ")))
-
-        ;; generate labels
-        positive-labels (mapv (constantly 1) positive-examples)
-        negative-labels (mapv (constantly 0) negative-examples)]
-    {:sentences x-text :labels (into positive-labels negative-labels)}))
-
-(defn pad-sentences
-  "Pads all sentences to the same length where the length is defined by the longest sentence. Returns padded sentences."
-  [sentences]
-  (let [padding-word EOS
-        sequence-len (apply max (mapv count sentences))]
-    (mapv (fn [s] (let [diff (- sequence-len (count s))]
-                    (if (pos? diff)
-                      (into s (repeat diff padding-word))
-                      s)))
-          sentences)))
-
-(defn build-vocab-embeddings
-  "Returns the subset of `embeddings` for words from the `vocab`.
-  Embeddings for words not in the vocabulary are initialized randomly
-  from a uniform distribution."
-  [vocab embedding-size embeddings]
-  (into {}
-        (mapv (fn [[word _]]
-                [word (or (get embeddings word)
-                          (ndarray/->vec (random/uniform -0.25 0.25 [embedding-size])))])
-              vocab)))
-
-(defn build-input-data-with-embeddings
-  "Map sentences and labels to vectors based on a pretrained embeddings."
-  [sentences embeddings]
-  (mapv (fn [sent] (mapv #(embeddings %) sent)) sentences))
-
-(defn build-vocab
-  "Creates a vocabulary for the data set based on frequency of words.
-  Returns a map from words to unique indices."
-  [sentences]
-  (let [words (flatten sentences)
-        wc (reduce
-            (fn [m w] (update-in m [w] (fnil inc 0)))
-            {}
-            words)
-        sorted-wc (sort-by second > wc)
-        sorted-w (map first sorted-wc)]
-    (into {} (map vector sorted-w (range (count sorted-w))))))
-
-(defn load-ms-with-embeddings
-  "Loads the movie review sentences data set for the given
-  `:pretrained-embedding` (e.g. `nil`, `:glove` or `:word2vec`)"
-  [path max-examples embedding-size {:keys [pretrained-embedding]
-                                     :or {pretrained-embedding nil}
-                                     :as opts}]
-  (let [{:keys [sentences labels]} (load-mr-data-and-labels path max-examples)
-        sentences-padded  (pad-sentences sentences)
-        vocab (build-vocab sentences-padded)
-        vocab-embeddings (case pretrained-embedding
-                           :glove (->> (load-glove! (glove-file-path embedding-size))
-                                       (build-vocab-embeddings vocab embedding-size))
-                           :fasttext (->> (load-fasttext! fasttext-file-path)
-                                          (build-vocab-embeddings vocab embedding-size))
-                           :word2vec (->> (load-word2vec-model! w2v-file-path embedding-size {:vocab vocab})
-                                          (:word2vec)
-                                          (build-vocab-embeddings vocab embedding-size))
-                           vocab)
-        data (build-input-data-with-embeddings sentences-padded vocab-embeddings)]
-    {:data data
-     :label labels
-     :sentence-count (count data)
-     :sentence-size (count (first data))
-     :embedding-size embedding-size
-     :vocab-size (count vocab)
-     :pretrained-embedding pretrained-embedding}))
-
diff --git a/contrib/clojure-package/examples/cnn-text-classification/test/cnn_text_classification/classifier_test.clj b/contrib/clojure-package/examples/cnn-text-classification/test/cnn_text_classification/classifier_test.clj
deleted file mode 100644
index 744307e3e363..000000000000
--- a/contrib/clojure-package/examples/cnn-text-classification/test/cnn_text_classification/classifier_test.clj
+++ /dev/null
@@ -1,48 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns cnn-text-classification.classifier-test
-  (:require [clojure.test :refer :all]
-            [org.apache.clojure-mxnet.module :as module]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.util :as util]
-            [org.apache.clojure-mxnet.context :as context]
-            [cnn-text-classification.classifier :as classifier]))
-
-(deftest classifier-with-embeddings-test
-  (let [train (classifier/train-convnet
-               {:devs [(context/default-context)]
-                :embedding-size 50
-                :batch-size 10
-                :test-size 100
-                :num-epoch 1
-                :max-examples 1000
-                :pretrained-embedding :glove})]
-    (is (= ["data"] (util/scala-vector->vec (module/data-names train))))
-    (is (= 20 (count (ndarray/->vec (-> train module/outputs ffirst)))))))
-
-(deftest classifier-without-embeddings-test
-  (let [train (classifier/train-convnet
-               {:devs [(context/default-context)]
-                :embedding-size 50
-                :batch-size 10
-                :test-size 100
-                :num-epoch 1
-                :max-examples 1000
-                :pretrained-embedding nil})]
-    (is (= ["data"] (util/scala-vector->vec (module/data-names train))))
-    (is (= 20 (count (ndarray/->vec (-> train module/outputs ffirst)))))))
diff --git a/contrib/clojure-package/examples/gan/.gitignore b/contrib/clojure-package/examples/gan/.gitignore
deleted file mode 100644
index ea8013148d11..000000000000
--- a/contrib/clojure-package/examples/gan/.gitignore
+++ /dev/null
@@ -1,12 +0,0 @@
-/target
-/classes
-/checkouts
-pom.xml
-pom.xml.asc
-*.jar
-*.class
-/.lein-*
-/.nrepl-port
-.hgignore
-.hg/
-results
\ No newline at end of file
diff --git a/contrib/clojure-package/examples/gan/README.md b/contrib/clojure-package/examples/gan/README.md
deleted file mode 100644
index 30ecafb266db..000000000000
--- a/contrib/clojure-package/examples/gan/README.md
+++ /dev/null
@@ -1,40 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# gan
-
-This is an example of how to do a GAN with the MNIST data
-
-## Installation
-
-Before you run this example, make sure that you have the clojure package installed.
-In the main clojure package directory, do `lein install`. Then you can run
-`lein install` in this directory.
-
-
-## Usage
-
-Do `lein run` and the images generated will be in the `results` directory. The gout* images are the ones generated, the diff* images are the visualization of the input gradient different fed to the generator
-
-`lein run :gpu` will run on gpu
-
-If you are running on AWS you will need to setup X11 for graphics
-`sudo apt install xauth x11-apps`
-
-then relogin in `ssh -X -i creds ubuntu@yourinstance`
-
-
diff --git a/contrib/clojure-package/examples/gan/project.clj b/contrib/clojure-package/examples/gan/project.clj
deleted file mode 100644
index 7ec497787680..000000000000
--- a/contrib/clojure-package/examples/gan/project.clj
+++ /dev/null
@@ -1,26 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(defproject gan-origami "0.1.0-SNAPSHOT"
-  :description "GAN MNIST with MXNet"
-  :plugins [[lein-cljfmt "0.5.7"]]
-  :repositories [["vendredi" {:url "https://repository.hellonico.info/repository/hellonico/"}]]
-  :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "2.0.0-SNAPSHOT"]
-                 [origami "4.0.0-3"]
-                 ]
-  :main gan.gan-mnist)
diff --git a/contrib/clojure-package/examples/gan/src/gan/gan_mnist.clj b/contrib/clojure-package/examples/gan/src/gan/gan_mnist.clj
deleted file mode 100644
index 944791bce604..000000000000
--- a/contrib/clojure-package/examples/gan/src/gan/gan_mnist.clj
+++ /dev/null
@@ -1,219 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns gan.gan-mnist
-  (:require [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [org.apache.clojure-mxnet.executor :as executor]
-            [org.apache.clojure-mxnet.eval-metric :as eval-metric]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.initializer :as init]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.optimizer :as opt]
-            [org.apache.clojure-mxnet.symbol :as sym]
-            [org.apache.clojure-mxnet.shape :as mx-shape]
-            [org.apache.clojure-mxnet.util :as util]
-            [gan.viz :as viz]
-            [org.apache.clojure-mxnet.context :as context])
-  (:gen-class))
-
-;; based off of https://medium.com/@julsimon/generative-adversarial-networks-on-apache-mxnet-part-1-b6d39e6b5df1
-
-
-(def data-dir "data/")
-(def output-path "results/")
-(def batch-size 100)
-(def num-epoch 10)
-
-(io/make-parents (str output-path "gout"))
-
-(when-not (.exists (io/file (str data-dir "train-images-idx3-ubyte")))
-  (sh "../../scripts/get_mnist_data.sh"))
-
-(defonce mnist-iter (mx-io/mnist-iter {:image (str data-dir "train-images-idx3-ubyte")
-                                       :label (str data-dir "train-labels-idx1-ubyte")
-                                       :input-shape [1 28 28]
-                                       :batch-size batch-size
-                                       :shuffle true}))
-
-(def rand-noise-iter (mx-io/rand-iter [batch-size 100 1 1]))
-
-(comment
-
-  ;;This is for figuring out the convolution and deconvolution layers to convert the image sizes
-
-  (defn conv-output-size [input-size kernel-size padding stride]
-    (float (inc (/ (- (+ input-size (* 2 padding)) kernel-size) stride))))
-
-  ;; Calcing the layer sizes for discriminator
-  (conv-output-size 28 4 3 2) ;=> 16
-  (conv-output-size 16 4 1 2) ;=> 8
-  (conv-output-size 8 4 1 2) ;=> 4.0
-  (conv-output-size 4 4 0 1) ;=> 1
-
-  ;; Calcing the layer sizes for generator
-  (defn deconv-output-size [input-size kernel-size padding stride]
-    (-
-     (+ (* stride (- input-size 1))
-        kernel-size)
-     (* 2 padding)))
-
-  (deconv-output-size 1 4 0 1) ;=> 4
-  (deconv-output-size 4 4 1 2) ;=> 8
-  (deconv-output-size 8 4 1 2) ;=> 16
-  (deconv-output-size 16 4 3 2)) ;=> 28
-
-
-(def ndf 28) ;; image height /width
-(def nc 1) ;; number of channels
-(def eps (float (+ 1e-5  1e-12)))
-(def lr  0.0005) ;; learning rate
-(def beta1 0.5)
-
-(def label (sym/variable "label"))
-
-(defn discriminator []
-  (as-> (sym/variable "data") data
-    (sym/convolution "d1" {:data data :kernel [4 4] :pad [3 3] :stride [2 2] :num-filter ndf :no-bias true})
-    (sym/batch-norm "dbn1" {:data data :fix-gamma true :eps eps})
-    (sym/leaky-re-lu "dact1" {:data data :act-type "leaky" :slope 0.2})
-
-    (sym/convolution "d2" {:data data :kernel [4 4] :pad [1 1] :stride [2 2] :num-filter (* 2 ndf) :no-bias true})
-    (sym/batch-norm "dbn2" {:data data :fix-gamma true :eps eps})
-    (sym/leaky-re-lu "dact1" {:data data :act_type "leaky" :slope 0.2})
-
-    (sym/convolution "d3" {:data data :kernel [4 4] :pad [1 1] :stride [2 2] :num-filter (* 3 ndf) :no-bias true})
-    (sym/batch-norm "dbn3" {:data data :fix-gamma true :eps eps})
-    (sym/leaky-re-lu "dact3" {:data data :act_type "leaky" :slope 0.2})
-
-    (sym/convolution "d4" {:data data :kernel [4 4] :pad [0 0] :stride [1 1] :num-filter (* 4 ndf) :no-bias true})
-    (sym/flatten "flt" {:data data})
-
-    (sym/fully-connected "fc" {:data data :num-hidden 1 :no-bias false})
-    (sym/logistic-regression-output "dloss" {:data data :label label})))
-
-(defn generator []
-  (as-> (sym/variable "rand") data
-    (sym/deconvolution "g1" {:data data :kernel [4 4]  :pad [0 0] :stride [1 1] :num-filter (* 4 ndf) :no-bias true})
-    (sym/batch-norm "gbn1" {:data data :fix-gamma true :eps eps})
-    (sym/activation "gact1" {:data data :act-type "relu"})
-
-    (sym/deconvolution "g2" {:data data :kernel [4 4] :pad [1 1] :stride [2 2] :num-filter (* 2 ndf) :no-bias true})
-    (sym/batch-norm "gbn2" {:data data :fix-gamma true :eps eps})
-    (sym/activation "gact2" {:data data :act-type "relu"})
-
-    (sym/deconvolution "g3" {:data data :kernel [4 4] :pad [1 1] :stride [2 2] :num-filter ndf :no-bias true})
-    (sym/batch-norm "gbn3" {:data data :fix-gamma true :eps eps})
-    (sym/activation "gact3" {:data data :act-type "relu"})
-
-    (sym/deconvolution "g4" {:data data :kernel [4 4] :pad [3 3] :stride [2 2] :num-filter nc :no-bias true})
-    (sym/activation "gact4" {:data data :act-type "tanh"})))
-
-(let [data [(ndarray/ones [batch-size 100 1 1])]
-      label [(ndarray/ones [batch-size 100 1 1])]]
-  (def my-iter (mx-io/ndarray-iter data)))
-
-(defn save-img-gout [i n x]
-  (do
-    (viz/im-sav {:title (str "gout-" i "-" n)
-                 :output-path output-path
-                 :x x
-                 :flip false})))
-
-(defn save-img-diff [i n x]
-  (do (viz/im-sav {:title (str "diff-" i "-" n)
-                   :output-path output-path
-                   :x x
-                   :flip false})))
-
-(defn save-img-data [i n batch]
-  (do (viz/im-sav {:title (str "data-" i "-" n)
-                   :output-path output-path
-                   :x (first (mx-io/batch-data batch))
-                   :flip false})))
-
-(defn calc-diff [i n diff-d]
-  (let [diff (ndarray/copy diff-d)
-        arr (ndarray/->vec diff)
-        mean (/ (apply + arr) (count arr))
-        std (let [tmp-a (map #(* (- % mean) (- % mean)) arr)]
-              (float (Math/sqrt (/ (apply + tmp-a) (count tmp-a)))))]
-    (let [calc-diff (ndarray/+ (ndarray/div (ndarray/- diff mean) std) 0.5)]
-
-      (save-img-diff i n calc-diff))))
-
-(defn train 
-  ([devs] (train devs num-epoch))
-  ([devs num-epoch]
-  (let [mod-d  (-> (m/module (discriminator) {:contexts devs :data-names ["data"] :label-names ["label"]})
-                   (m/bind {:data-shapes (mx-io/provide-data-desc mnist-iter)
-                            :label-shapes (mx-io/provide-label-desc mnist-iter)
-                            :inputs-need-grad true})
-                   (m/init-params {:initializer (init/normal 0.02)})
-                   (m/init-optimizer {:optimizer (opt/adam {:learning-rate lr :wd 0.0 :beta1 beta1})}))
-        mod-g (-> (m/module (generator) {:contexts devs :data-names ["rand"] :label-names nil})
-                  (m/bind {:data-shapes (mx-io/provide-data-desc rand-noise-iter)})
-                  (m/init-params {:initializer (init/normal 0.02)})
-                  (m/init-optimizer {:optimizer (opt/adam {:learning-rate lr :wd 0.0 :beta1 beta1})}))]
-
-    (println "Training for " num-epoch " epochs...")
-    (doseq [i (range num-epoch)]
-      (mx-io/reduce-batches mnist-iter
-                            (fn [n batch]
-                              (let [rbatch (mx-io/next rand-noise-iter)
-                                    out-g (-> mod-g
-                                              (m/forward rbatch)
-                                              (m/outputs))
-                                   ;; update the discriminiator on the fake
-                                    grads-f  (mapv #(ndarray/copy (first %)) (-> mod-d
-                                                                                 (m/forward {:data (first out-g) :label [(ndarray/zeros [batch-size])]})
-                                                                                 (m/backward)
-                                                                                 (m/grad-arrays)))
-                                   ;; update the discrimintator on the real
-                                    grads-r (-> mod-d
-                                                (m/forward {:data (mx-io/batch-data batch) :label [(ndarray/ones [batch-size])]})
-                                                (m/backward)
-                                                (m/grad-arrays))
-                                    _ (mapv (fn [real fake] (let [r (first real)]
-                                                              (ndarray/set r (ndarray/+ r fake)))) grads-r grads-f)
-                                    _ (m/update mod-d)
-                                   ;; update the generator
-                                    diff-d (-> mod-d
-                                               (m/forward {:data (first out-g) :label [(ndarray/ones [batch-size])]})
-                                               (m/backward)
-                                               (m/input-grads))
-                                    _ (-> mod-g
-                                          (m/backward (first diff-d))
-                                          (m/update))]
-                                (when (zero? (mod n 100))
-                                  (println "iteration = " i  "number = " n)
-                                  (save-img-gout i n (ndarray/copy (ffirst out-g)))
-                                  (save-img-data i n batch)
-                                  (calc-diff i n (ffirst diff-d)))
-                                (inc n))))))))
-
-(defn -main [& args]
-  (let [[dev dev-num] args
-        devs (if (= dev ":gpu")
-               (mapv #(context/gpu %) (range (Integer/parseInt (or dev-num "1"))))
-               (mapv #(context/cpu %) (range (Integer/parseInt (or dev-num "1")))))]
-    (println "Running with context devices of" devs)
-    (train devs)))
-
-(comment
-  (train [(context/cpu)]))
diff --git a/contrib/clojure-package/examples/gan/src/gan/viz.clj b/contrib/clojure-package/examples/gan/src/gan/viz.clj
deleted file mode 100644
index 08da53cb2382..000000000000
--- a/contrib/clojure-package/examples/gan/src/gan/viz.clj
+++ /dev/null
@@ -1,59 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns gan.viz
-  (:require [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.shape :as mx-shape]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [opencv4.utils :as cvu]
-            [opencv4.core :as cv :refer [CV_8UC1 new-matofbyte flip! imwrite new-size hconcat! vconcat! new-mat merge!]]))
-
-(defn clip
-  "Multiplies every value of `x` by 255, clamps the product to [0, 255]
-  (in-range products are truncated with `int`) and returns a vector holding
-  the `.byteValue` of each clamped number."
-  [x]
-  (let [clamp (fn [v]
-                (cond
-                  (< v 0) 0
-                  (> v 255) 255
-                  :else (int v)))]
-    (mapv (fn [v] (.byteValue (clamp (* 255 v)))) x)))
-
-(defn get-img
-  "Fills a new `height` x `width` CV_8UC1 matrix with the bytes of `raw-data`
-  and returns it, passed through `(flip! img 0)` when `flip` is truthy.
-  Throws an Exception whenever `channels` is greater than 1."
-  [raw-data channels height width flip]
-  (when (> channels 1)
-    (throw (Exception. "Image with 3 channels (RGB) not supported")))
-  ;; gray image
-  (let [img (cv/>> (new-mat height width CV_8UC1) (byte-array raw-data))]
-    (if flip
-      (flip! img 0)
-      img)))
-
-(defn im-sav
-  "Writes the images held in the 4-d NDArray `x` (shape [n c h w]) as one
-  tiled JPEG at `(str output-path title \".jpg\")`.
-
-  The clipped bytes of `x` are cut into rows of `(int (sqrt n))` images each;
-  every image becomes a matrix via `get-img` (which receives `flip`, default
-  false), rows are joined with `hconcat!`, the rows are stacked with
-  `vconcat!`, and the result is resized by 1.5 before being written."
-  [{:keys [title output-path x flip]
-    :or {flip false} :as g-mod}]
-  (let [dims (mx-shape/->vec (ndarray/shape x))
-        _ (assert (== 4 (count dims)))
-        [n c h w] dims
-        img-len (* c h w)
-        pixels (byte-array (clip (ndarray/to-array x)))
-        per-row (.intValue (Math/sqrt n))
-        ->tile (fn [chunk] (get-img (vec chunk) c h w flip))
-        ->row (fn [row-bytes]
-                (hconcat! (mapv ->tile (partition img-len row-bytes))))
-        grid (vconcat! (mapv ->row (partition (* per-row img-len) pixels)))]
-    (imwrite (cvu/resize-by grid 1.5)
-             (str output-path title ".jpg"))))
\ No newline at end of file
diff --git a/contrib/clojure-package/examples/gan/test/gan/gan_test.clj b/contrib/clojure-package/examples/gan/test/gan/gan_test.clj
deleted file mode 100644
index 71b9126cae25..000000000000
--- a/contrib/clojure-package/examples/gan/test/gan/gan_test.clj
+++ /dev/null
@@ -1,25 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns gan.gan_test
- (:require 
- 	[gan.gan-mnist :refer :all]
- 	[org.apache.clojure-mxnet.context :as context]
- 	[clojure.test :refer :all]))
-
-(deftest check-pdf
-  ;; Smoke test: calls `train` with one CPU context and a second argument of 1.
-  ;; There are no assertions, so it only fails when `train` throws.
-  (let [devices [(context/cpu)]]
-    (train devices 1)))
\ No newline at end of file
diff --git a/contrib/clojure-package/examples/imclassification/.gitignore b/contrib/clojure-package/examples/imclassification/.gitignore
deleted file mode 100644
index c53038ec0e3d..000000000000
--- a/contrib/clojure-package/examples/imclassification/.gitignore
+++ /dev/null
@@ -1,11 +0,0 @@
-/target
-/classes
-/checkouts
-pom.xml
-pom.xml.asc
-*.jar
-*.class
-/.lein-*
-/.nrepl-port
-.hgignore
-.hg/
diff --git a/contrib/clojure-package/examples/imclassification/README.md b/contrib/clojure-package/examples/imclassification/README.md
deleted file mode 100644
index d5ab99ced90d..000000000000
--- a/contrib/clojure-package/examples/imclassification/README.md
+++ /dev/null
@@ -1,46 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# imclassification
-
-This example shows how to do image classification with the module API.
-
-It includes an example of the high-level training API `fit` and shows how to train on multiple CPUs/GPUs.
-
-For more examples of how to use different parts of the module API, see the module example.
-
-An example of using the profiler.
-
-## Installation
-
-Before you run this example, make sure that you have the clojure package installed.
-In the main clojure package directory, do `lein install`. Then you can run
-`lein install` in this directory.
-
-
-## Usage
-
-To run the example you must do
-
-* `lein install` in the root of the main project directory
-* cd into this project directory and do `lein run`. This will execute the cpu version.
-
-You can control the devices you run on by doing:
-
-* `lein run :cpu 2` - This will run on 2 cpu devices
-* `lein run :gpu 1` - This will run on 1 gpu device
-* `lein run :gpu 2` - This will run on 2 gpu devices
diff --git a/contrib/clojure-package/examples/imclassification/project.clj b/contrib/clojure-package/examples/imclassification/project.clj
deleted file mode 100644
index 471c7f760de2..000000000000
--- a/contrib/clojure-package/examples/imclassification/project.clj
+++ /dev/null
@@ -1,24 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-;; Leiningen project definition for the image classification example.
-(defproject imclassification "0.1.0-SNAPSHOT"
-  :description "Clojure examples for image classification"
-  :plugins [[lein-cljfmt "0.5.7"]]
-  ;; clojure-mxnet is a SNAPSHOT dependency; the README says to `lein install`
-  ;; the main clojure package first so it can be resolved.
-  :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "2.0.0-SNAPSHOT"]]
-  :pedantic? :skip
-  ;; namespace whose -main is the program entry point
-  :main imclassification.train-mnist)
diff --git a/contrib/clojure-package/examples/imclassification/src/imclassification/train_mnist.clj b/contrib/clojure-package/examples/imclassification/src/imclassification/train_mnist.clj
deleted file mode 100644
index 164b5f2620f2..000000000000
--- a/contrib/clojure-package/examples/imclassification/src/imclassification/train_mnist.clj
+++ /dev/null
@@ -1,121 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns imclassification.train-mnist
-  (:require [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.symbol :as sym]
-            [org.apache.clojure-mxnet.kvstore :as kvstore]
-            [org.apache.clojure-mxnet.kvstore-server :as kvstore-server]
-            [org.apache.clojure-mxnet.optimizer :as optimizer]
-            [org.apache.clojure-mxnet.eval-metric :as eval-metric]
-            [org.apache.clojure-mxnet.resource-scope :as resource-scope])
-  (:gen-class))
-
-(def data-dir "data/") ;; the directory holding the mnist data files
-(def batch-size 10) ;; the batch size
-(def optimizer (optimizer/sgd {:learning-rate 0.01 :momentum 0.0}))
-(def eval-metric (eval-metric/accuracy))
-(def num-epoch 1) ;; the default number of training epochs
-(def kvstore "local") ;; the kvstore type
-;;; Note: to run distributed you might need to compile the engine with an option set
-(def role "worker") ;; scheduler/server/worker
-(def scheduler-host nil) ;; scheduler hostname / ip address
-(def scheduler-port 0) ;; scheduler port
-(def num-workers 1) ;; # of workers
-(def num-servers 1) ;; # of servers
-
-
-;; Environment map passed to `kvstore-server/init`: always carries the role,
-;; and additionally the scheduler address plus worker/server counts when
-;; `scheduler-host` is set.
-(def envs
-  (let [base {"DMLC_ROLE" role}]
-    (if scheduler-host
-      (assoc base
-             "DMLC_PS_ROOT_URI" scheduler-host
-             "DMLC_PS_ROOT_PORT" (str scheduler-port)
-             "DMLC_NUM_WORKER" (str num-workers)
-             "DMLC_NUM_SERVER" (str num-servers))
-      base)))
-
-;; Runs at namespace load: call the download script when the MNIST training
-;; images are not present under `data-dir`.
-(let [train-images (io/file (str data-dir "train-images-idx3-ubyte"))]
-  (when-not (.exists train-images)
-    (sh "../../scripts/get_mnist_data.sh")))
-
-(defn get-symbol
-  "Builds the network symbol: the \"data\" variable feeds fc1 (128 hidden
-  units) -> relu1 -> fc2 (64) -> relu2 -> fc3 (10) -> the \"softmax\" output."
-  []
-  (let [input (sym/variable "data")
-        fc1 (sym/fully-connected "fc1" {:data input :num-hidden 128})
-        relu1 (sym/activation "relu1" {:data fc1 :act-type "relu"})
-        fc2 (sym/fully-connected "fc2" {:data relu1 :num-hidden 64})
-        relu2 (sym/activation "relu2" {:data fc2 :act-type "relu"})
-        fc3 (sym/fully-connected "fc3" {:data relu2 :num-hidden 10})]
-    (sym/softmax-output "softmax" {:data fc3})))
-
-
-(defn train-data
-  "Creates the MNIST training iterator from the train image/label files in
-  `data-dir`: flat 784-element inputs, batches of `batch-size`, shuffled with
-  seed 10, labels named \"softmax_label\", split into `num-workers` parts of
-  which part 0 is read."
-  []
-  (let [images (str data-dir "train-images-idx3-ubyte")
-        labels (str data-dir "train-labels-idx1-ubyte")]
-    (mx-io/mnist-iter {:image images
-                       :label labels
-                       :label-name "softmax_label"
-                       :input-shape [784]
-                       :batch-size batch-size
-                       :shuffle true
-                       :flat true
-                       :silent false
-                       :seed 10
-                       :num-parts num-workers
-                       :part-index 0})))
-
-(defn eval-data
-  "Creates the MNIST evaluation iterator from the t10k image/label files in
-  `data-dir`: flat 784-element inputs in batches of `batch-size`, split into
-  `num-workers` parts of which part 0 is read."
-  []
-  (let [images (str data-dir "t10k-images-idx3-ubyte")
-        labels (str data-dir "t10k-labels-idx1-ubyte")]
-    (mx-io/mnist-iter {:image images
-                       :label labels
-                       :input-shape [784]
-                       :batch-size batch-size
-                       :flat true
-                       :silent false
-                       :num-parts num-workers
-                       :part-index 0})))
-
-(defn start
-  "Runs this process according to `role`.
-
-  With one argument, trains for the default `num-epoch` epochs. When
-  `scheduler-host` is set, the kvstore server environment is initialised
-  from `envs` first. A \"worker\" role fits a module built from `get-symbol`
-  on `devs` for `_num-epoch` epochs and saves a checkpoint under the prefix
-  \"target/test\"; any other role starts the kvstore server instead."
-  ([devs] (start devs num-epoch))
-  ([devs _num-epoch]
-   (when scheduler-host
-     (println "Initing PS enviornments with " envs)
-     (kvstore-server/init envs))
-   (if (= "worker" role)
-     (do
-       (println "Starting Training of MNIST ....")
-       (println "Running with context devices of" devs)
-       (resource-scope/with-let [_mod (m/module (get-symbol) {:contexts devs})]
-         (let [fit-opts {:train-data (train-data)
-                         :eval-data (eval-data)
-                         :num-epoch _num-epoch
-                         :fit-params (m/fit-params {:kvstore kvstore
-                                                    :optimizer optimizer
-                                                    :eval-metric eval-metric})}
-               fitted (m/fit _mod fit-opts)]
-           (m/save-checkpoint fitted {:prefix "target/test" :epoch _num-epoch}))
-         (println "Finish fit")))
-     (do
-       (println "Start KVStoreServer for scheduler and servers")
-       (kvstore-server/start)))))
-
-(defn -main
-  "Command-line entry point. The first argument selects the device type
-  (\":gpu\" for GPU contexts, anything else for CPU contexts); the optional
-  second argument is the number of devices and defaults to \"1\". The
-  resulting contexts are passed to `start`."
-  [& args]
-  (let [[dev dev-num] args
-        device-count (Integer/parseInt (or dev-num "1"))
-        make-context (if (= dev ":gpu")
-                       #(context/gpu %)
-                       #(context/cpu %))]
-    (start (mapv make-context (range device-count)))))
-
-(comment
-  ;; REPL helper: evaluate the form below by hand to run `start` on a single CPU context.
-  (start [(context/cpu)]))
diff --git a/contrib/clojure-package/examples/imclassification/test/imclassification/train_mnist_test.clj b/contrib/clojure-package/examples/imclassification/test/imclassification/train_mnist_test.clj
deleted file mode 100644
index f185891ab31e..000000000000
--- a/contrib/clojure-package/examples/imclassification/test/imclassification/train_mnist_test.clj
+++ /dev/null
@@ -1,40 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns imclassification.train-mnist-test
-	(:require
-		[clojure.test :refer :all]
-		[clojure.java.io :as io]
-		[clojure.string :as s]
-		[org.apache.clojure-mxnet.context :as context]
-		[org.apache.clojure-mxnet.module :as module]
-		[imclassification.train-mnist :as mnist]))
-
-(defn- file-to-filtered-seq
-  "Returns the lines of `file`, in order, without any line that contains
-  \"mxnet_version\", so symbol files can be compared without that line.
-
-  The lines are realized eagerly into a vector inside `with-open`, so the
-  reader is closed before returning; handing back a lazy `line-seq` would
-  leave the reader open. A vector still compares equal to a seq of the same
-  lines under `=`."
-  [file]
-  (with-open [rdr (io/reader (io/file file))]
-    (into []
-          (remove #(s/includes? % "mxnet_version"))
-          (line-seq rdr))))
-
-(deftest mnist-two-epochs-test
-  ;; Run `mnist/start` on one CPU context with 2 epochs, then compare the
-  ;; symbol file written under target/ with the checked-in reference
-  ;; (lines mentioning mxnet_version are filtered out of both).
-  (mnist/start [(context/cpu)] 2)
-  (let [expected (file-to-filtered-seq "test/test-symbol.json.ref")
-        actual (file-to-filtered-seq "target/test-symbol.json")]
-    (is (= expected actual))))
diff --git a/contrib/clojure-package/examples/imclassification/test/test-symbol.json.ref b/contrib/clojure-package/examples/imclassification/test/test-symbol.json.ref
deleted file mode 100644
index ba1d2fad3a8a..000000000000
--- a/contrib/clojure-package/examples/imclassification/test/test-symbol.json.ref
+++ /dev/null
@@ -1,105 +0,0 @@
-{
-  "nodes": [
-    {
-      "op": "null", 
-      "name": "data", 
-      "inputs": []
-    }, 
-    {
-      "op": "null", 
-      "name": "fc1_weight", 
-      "attrs": {"num_hidden": "128"}, 
-      "inputs": []
-    }, 
-    {
-      "op": "null", 
-      "name": "fc1_bias", 
-      "attrs": {"num_hidden": "128"}, 
-      "inputs": []
-    }, 
-    {
-      "op": "FullyConnected", 
-      "name": "fc1", 
-      "attrs": {"num_hidden": "128"}, 
-      "inputs": [[0, 0, 0], [1, 0, 0], [2, 0, 0]]
-    }, 
-    {
-      "op": "Activation", 
-      "name": "relu1", 
-      "attrs": {"act_type": "relu"}, 
-      "inputs": [[3, 0, 0]]
-    }, 
-    {
-      "op": "null", 
-      "name": "fc2_weight", 
-      "attrs": {"num_hidden": "64"}, 
-      "inputs": []
-    }, 
-    {
-      "op": "null", 
-      "name": "fc2_bias", 
-      "attrs": {"num_hidden": "64"}, 
-      "inputs": []
-    }, 
-    {
-      "op": "FullyConnected", 
-      "name": "fc2", 
-      "attrs": {"num_hidden": "64"}, 
-      "inputs": [[4, 0, 0], [5, 0, 0], [6, 0, 0]]
-    }, 
-    {
-      "op": "Activation", 
-      "name": "relu2", 
-      "attrs": {"act_type": "relu"}, 
-      "inputs": [[7, 0, 0]]
-    }, 
-    {
-      "op": "null", 
-      "name": "fc3_weight", 
-      "attrs": {"num_hidden": "10"}, 
-      "inputs": []
-    }, 
-    {
-      "op": "null", 
-      "name": "fc3_bias", 
-      "attrs": {"num_hidden": "10"}, 
-      "inputs": []
-    }, 
-    {
-      "op": "FullyConnected", 
-      "name": "fc3", 
-      "attrs": {"num_hidden": "10"}, 
-      "inputs": [[8, 0, 0], [9, 0, 0], [10, 0, 0]]
-    }, 
-    {
-      "op": "null", 
-      "name": "softmax_label", 
-      "inputs": []
-    }, 
-    {
-      "op": "SoftmaxOutput", 
-      "name": "softmax", 
-      "inputs": [[11, 0, 0], [12, 0, 0]]
-    }
-  ], 
-  "arg_nodes": [0, 1, 2, 5, 6, 9, 10, 12], 
-  "node_row_ptr": [
-    0, 
-    1, 
-    2, 
-    3, 
-    4, 
-    5, 
-    6, 
-    7, 
-    8, 
-    9, 
-    10, 
-    11, 
-    12, 
-    13, 
-    14
-  ], 
-  "heads": [[13, 0, 0]], 
-  "attrs": {"mxnet_version": ["int", 10400]}
-}
\ No newline at end of file
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/.gitignore b/contrib/clojure-package/examples/infer/imageclassifier/.gitignore
deleted file mode 100644
index 35491f1a084a..000000000000
--- a/contrib/clojure-package/examples/infer/imageclassifier/.gitignore
+++ /dev/null
@@ -1,12 +0,0 @@
-/target
-/classes
-/checkouts
-/images
-pom.xml
-pom.xml.asc
-*.jar
-*.class
-/.lein-*
-/.nrepl-port
-.hgignore
-.hg/
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/README.md b/contrib/clojure-package/examples/infer/imageclassifier/README.md
deleted file mode 100644
index ef82feebd53f..000000000000
--- a/contrib/clojure-package/examples/infer/imageclassifier/README.md
+++ /dev/null
@@ -1,41 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# imageclassifier
-
-Run image classification using clojure infer package.
-
-## Installation
-
-Before you run this example, make sure that you have the clojure package installed.
-In the main clojure package directory, do `lein install`. Then you can run
-`lein install` in this directory.
-
-## Usage
-
-```
-$ chmod +x scripts/get_resnet_18_data.sh
-$ ./scripts/get_resnet_18_data.sh
-$
-$ lein run -- --help
-$ lein run -- -m models/resnet-18/resnet-18 -i images/kitten.jpg -d images/
-$
-$ lein uberjar
-$ java -jar target/imageclassifier-0.1.0-SNAPSHOT-standalone.jar --help
-$ java -jar target/imageclassifier-0.1.0-SNAPSHOT-standalone.jar \
-    -m models/resnet-18/resnet-18 -i images/kitten.jpg -d images/
-```
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/project.clj b/contrib/clojure-package/examples/infer/imageclassifier/project.clj
deleted file mode 100644
index f04d76274fa1..000000000000
--- a/contrib/clojure-package/examples/infer/imageclassifier/project.clj
+++ /dev/null
@@ -1,25 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-;; Leiningen project definition for the image classifier inference example.
-(defproject imageclassifier "0.1.0-SNAPSHOT"
-  :description "Image classification using infer with MXNet"
-  :plugins [[lein-cljfmt "0.5.7"]]
-  ;; tools.cli provides `parse-opts`, used by the example's -main
-  :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.clojure/tools.cli "0.4.1"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "2.0.0-SNAPSHOT"]]
-  ;; the main namespace is excluded from AOT by default; the uberjar profile
-  ;; compiles everything ahead of time
-  :main ^:skip-aot infer.imageclassifier-example
-  :profiles {:uberjar {:aot :all}})
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/scripts/get_resnet_18_data.sh b/contrib/clojure-package/examples/infer/imageclassifier/scripts/get_resnet_18_data.sh
deleted file mode 100755
index 1a142e8edbfd..000000000000
--- a/contrib/clojure-package/examples/infer/imageclassifier/scripts/get_resnet_18_data.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -evx
-
-MXNET_ROOT=$(cd "$(dirname $0)/.."; pwd)
-
-data_path=$MXNET_ROOT/models/resnet-18/
-
-image_path=$MXNET_ROOT/images/
-
-if [ ! -d "$data_path" ]; then
-    mkdir -p "$data_path"
-fi
-
-if [ ! -d "$image_path" ]; then
-    mkdir -p "$image_path"
-fi
-
-if [ ! -f "$data_path/resnet-18-0000.params" ]; then
-    wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/resnet-18-symbol.json -P $data_path
-    wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/resnet-18-0000.params -P $data_path
-    wget https://s3.us-east-2.amazonaws.com/scala-infer-models/resnet-18/synset.txt -P $data_path
-fi
-
-if [ ! -f "$image_path/kitten.jpg" ]; then
-    wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/kitten.jpg -P $image_path
-    wget https://s3.amazonaws.com/model-server/inputs/Pug-Cookie.jpg -P $image_path
-fi
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/scripts/get_resnet_data.sh b/contrib/clojure-package/examples/infer/imageclassifier/scripts/get_resnet_data.sh
deleted file mode 100755
index fcef59bacc6f..000000000000
--- a/contrib/clojure-package/examples/infer/imageclassifier/scripts/get_resnet_data.sh
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Exit on the first failing command.
-set -e
-
-# Directory one level above this script. Quoting "$0" keeps paths containing
-# spaces intact, and `&&` stops `pwd` from reporting the wrong directory when
-# `cd` fails.
-MXNET_ROOT=$(cd "$(dirname "$0")/.." && pwd)
-
-data_path=$MXNET_ROOT/models/resnet-152/
-
-image_path=$MXNET_ROOT/images/
-
-# `mkdir -p` succeeds when the directory already exists, so no existence
-# test is needed.
-mkdir -p "$data_path"
-mkdir -p "$image_path"
-
-# Download the model files only when the params file is missing.
-if [ ! -f "$data_path/resnet-152-0000.params" ]; then
-  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/resnet-152-0000.params -P "$data_path"
-  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/resnet-152-symbol.json -P "$data_path"
-  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/synset.txt -P "$data_path"
-fi
-
-# Download the sample image only when it is missing.
-if [ ! -f "$image_path/kitten.jpg" ]; then
-  wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/resnet152/kitten.jpg -P "$image_path"
-fi
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/src/infer/imageclassifier_example.clj b/contrib/clojure-package/examples/infer/imageclassifier/src/infer/imageclassifier_example.clj
deleted file mode 100644
index bc8b82e1ece1..000000000000
--- a/contrib/clojure-package/examples/infer/imageclassifier/src/infer/imageclassifier_example.clj
+++ /dev/null
@@ -1,113 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns infer.imageclassifier-example
-  (:require [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.dtype :as dtype]
-            [org.apache.clojure-mxnet.infer :as infer]
-            [org.apache.clojure-mxnet.layout :as layout]
-            [clojure.java.io :as io]
-            [clojure.string :refer [join]]
-            [clojure.tools.cli :refer [parse-opts]])
-  (:gen-class))
-
-(defn check-valid-dir
-  "Returns true when `input-dir` names an existing directory, false otherwise."
-  [input-dir]
-  (let [dir (io/file input-dir)]
-    (if (.exists dir)
-      (.isDirectory dir)
-      false)))
-
-(defn check-valid-file
-  "Returns true when something exists at the path `input-file`."
-  [input-file]
-  (-> input-file
-      io/file
-      .exists))
-
-;; Option specs handed to `parse-opts` in -main. Every path option has a
-;; default and a validator paired with the message reported on failure.
-(def cli-options
-  ;; the prefix is valid when "<prefix>-symbol.json" exists
-  [["-m" "--model-path-prefix PREFIX" "Model path prefix"
-    :default "models/resnet-18/resnet-18"
-    :validate [#(check-valid-file (str % "-symbol.json"))
-               "Model path prefix is invalid"]]
-   ["-i" "--input-image IMAGE" "Input image"
-    :default "images/kitten.jpg"
-    :validate [check-valid-file "Input file not found"]]
-   ["-d" "--input-dir IMAGE_DIR" "Input directory"
-    :default "images/"
-    :validate [check-valid-dir "Input directory not found"]]
-   ["-h" "--help"]])
-
-(defn print-predictions
-  "Prints each element of `predictions` on its own line, framed above and
-  below by a rule of 80 \"=\" characters."
-  [predictions]
-  (let [rule (apply str (repeat 80 "="))]
-    (println rule)
-    (run! println predictions)
-    (println rule)))
-
-(defn classify-single-image
-  "Loads the image at `input-image`, classifies it with `classifier` asking
-  for the top 5 predictions, and returns that result wrapped in a
-  one-element vector. Nothing is printed here."
-  [classifier input-image]
-  (let [topk 5
-        image (infer/load-image-from-file input-image)]
-    [(infer/classify-image classifier image topk)]))
-
-(defn classify-images-in-dir
-  "Classifies every regular file whose path ends in \".jpg\" among the
-  entries `file-seq` yields for `input-dir`. Paths are taken in reverse
-  sorted order, loaded and classified in batches of up to 20 with the top 5
-  predictions requested, and the per-batch results are concatenated."
-  [classifier input-dir]
-  (let [batch-size 20
-        topk 5
-        image-paths (->> (io/file input-dir)
-                         file-seq
-                         sort
-                         reverse
-                         (filter #(.isFile %))
-                         (filter #(re-matches #".*\.jpg$" (.getPath %)))
-                         (mapv #(.getPath %)))
-        classify-batch (fn [image-files]
-                         (infer/classify-image-batch
-                          classifier
-                          (infer/load-image-paths image-files)
-                          topk))]
-    (mapcat classify-batch (partition-all batch-size image-paths))))
-
-(defn run-classifier
-  "Creates an image classifier for the model at `:model-path-prefix` (one
-  float32 NCHW \"data\" input of shape [1 3 224 224], default context), then
-  prints the predictions for `:input-image` followed by the predictions for
-  the images found in `:input-dir`."
-  [options]
-  (let [{:keys [model-path-prefix input-image input-dir]} options
-        data-descriptor {:name "data"
-                         :shape [1 3 224 224]
-                         :layout layout/NCHW
-                         :dtype dtype/FLOAT32}
-        factory (infer/model-factory model-path-prefix [data-descriptor])
-        classifier (infer/create-image-classifier
-                    factory {:contexts [(context/default-context)]})]
-    (println "Classifying a single image")
-    (print-predictions (classify-single-image classifier input-image))
-    (println "\n")
-    (println "Classifying images in a directory")
-    (run! print-predictions (classify-images-in-dir classifier input-dir))))
-
-(defn -main
-  "Parses `args` against `cli-options`, then prints the usage summary when
-  --help was given, prints the parse errors (one per line) when there are
-  any, and otherwise runs the classifier with the parsed options."
-  [& args]
-  (let [{:keys [options summary errors]} (parse-opts args cli-options)]
-    (if (:help options)
-      (println summary)
-      (if (some? errors)
-        (println (join "\n" errors))
-        (run-classifier options)))))
diff --git a/contrib/clojure-package/examples/infer/imageclassifier/test/infer/imageclassifier_example_test.clj b/contrib/clojure-package/examples/infer/imageclassifier/test/infer/imageclassifier_example_test.clj
deleted file mode 100644
index 4b71f845dd5f..000000000000
--- a/contrib/clojure-package/examples/infer/imageclassifier/test/infer/imageclassifier_example_test.clj
+++ /dev/null
@@ -1,58 +0,0 @@
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns infer.imageclassifier-example-test
-  (:require [infer.imageclassifier-example :refer [classify-single-image
-                                                   classify-images-in-dir]]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.dtype :as dtype]
-            [org.apache.clojure-mxnet.infer :as infer]
-            [org.apache.clojure-mxnet.layout :as layout]
-            [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [clojure.test :refer :all]))
-
-;; Relative locations of the model files and the test images.
-(def model-dir "models/")
-(def image-dir "images/")
-;; Common path prefix of the resnet-18 model files.
-(def model-path-prefix (str model-dir "resnet-18/resnet-18"))
-(def image-file (str image-dir "kitten.jpg"))
-
-;; Evaluated when this namespace is loaded: if the model's symbol file is
-;; missing, run the data script (presumably it downloads the model and the
-;; images; confirm in scripts/get_resnet_18_data.sh).
-(when-not (.exists (io/file (str model-path-prefix "-symbol.json")))
-  (sh "./scripts/get_resnet_18_data.sh"))
-
-(defn create-classifier
-  "Returns an image classifier for the model at `model-path-prefix`, with a
-  single FLOAT32 input named \"data\" of shape [1 3 224 224] in NCHW layout."
-  []
-  (let [input-desc {:name "data"
-                    :shape [1 3 224 224]
-                    :layout layout/NCHW
-                    :dtype dtype/FLOAT32}]
-    (infer/create-image-classifier
-     (infer/model-factory model-path-prefix [input-desc]))))
-
-;; Classifies the single kitten image and checks the returned predictions:
-;; five entries, the expected top class, and a top probability strictly
-;; between 0 and 1.
-(deftest test-single-classification
-  (let [classifier (create-classifier)
-        [[predictions]] (classify-single-image classifier image-file)]
-    (is (some? predictions))
-    (is (= 5 (count predictions)))
-    (is (= "n02123159 tiger cat" (:class (first predictions))))
-    ;; Assert the range check directly. Wrapping it in a one-argument `=`
-    ;; always evaluates to true, so the check could never fail.
-    (is (< 0 (:prob (first predictions)) 1))))
-
-;; Classifies the images in `image-dir` and checks the first result:
-;; five entries, the expected top class, and a top probability strictly
-;; between 0 and 1.
-(deftest test-batch-classification
-  (let [classifier (create-classifier)
-        predictions (first (classify-images-in-dir classifier image-dir))]
-    (is (some? predictions))
-    (is (= 5 (count predictions)))
-    (is (= "n02123159 tiger cat" (:class (first predictions))))
-    ;; Assert the range check directly. Wrapping it in a one-argument `=`
-    ;; always evaluates to true, so the check could never fail.
-    (is (< 0 (:prob (first predictions)) 1))))
diff --git a/contrib/clojure-package/examples/infer/predictor/test/infer/predictor_example_test.clj b/contrib/clojure-package/examples/infer/predictor/test/infer/predictor_example_test.clj
deleted file mode 100644
index 02f826fbb77f..000000000000
--- a/contrib/clojure-package/examples/infer/predictor/test/infer/predictor_example_test.clj
+++ /dev/null
@@ -1,51 +0,0 @@
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns infer.predictor-example-test
-  (:require [infer.predictor-example :refer [preprocess
-                                             do-inference
-                                             postprocess]]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.dtype :as dtype]
-            [org.apache.clojure-mxnet.infer :as infer]
-            [org.apache.clojure-mxnet.layout :as layout]
-            [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [clojure.test :refer :all]))
-
-;; Relative locations of the model files and the test image.
-(def model-dir "models/")
-(def image-file "images/kitten.jpg")
-;; Common path prefix of the resnet-18 model files.
-(def model-path-prefix (str model-dir "resnet-18/resnet-18"))
-;; Width and height used both for the input descriptor shape and for
-;; preprocessing the image in the test below.
-(def width 224)
-(def height 224)
-
-;; Evaluated when this namespace is loaded: if the model's symbol file is
-;; missing, run the data script (presumably it downloads the model and the
-;; image; confirm in scripts/get_resnet_18_data.sh).
-(when-not (.exists (io/file (str model-path-prefix "-symbol.json")))
-  (sh "./scripts/get_resnet_18_data.sh"))
-
-(defn create-predictor
-  "Returns a predictor for the model at `model-path-prefix`, with a single
-  FLOAT32 input named \"data\" of shape [1 3 height width] in NCHW layout."
-  []
-  (let [input-desc {:name "data"
-                    :shape [1 3 height width]
-                    :layout layout/NCHW
-                    :dtype dtype/FLOAT32}]
-    (infer/create-predictor
-     (infer/model-factory model-path-prefix [input-desc]))))
-
-;; End-to-end check: preprocess the kitten image, run inference on it and
-;; expect the post-processed best prediction to be the tiger cat class.
-(deftest predictor-test
-  (let [best-prediction (postprocess
-                         model-path-prefix
-                         (do-inference (create-predictor)
-                                       (preprocess image-file width height)))]
-    (is (= "n02123159 tiger cat" best-prediction))))
diff --git a/contrib/clojure-package/examples/module/README.md b/contrib/clojure-package/examples/module/README.md
deleted file mode 100644
index fcd16b74a173..000000000000
--- a/contrib/clojure-package/examples/module/README.md
+++ /dev/null
@@ -1,38 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-## Instructions
-
-This shows off how to use the module api.
-
-There are examples of:
- - high level api of training and prediction
- - intermediate level api with save and loading from checkpoints
- - examples of how to iterate through the batches and calculate accuracy and predict manually.
-
-To run the example you must do
-
-* `lein install` in the root of the main project directory
-* cd into this project directory and do `lein run`. This will execute the cpu version.
-
-You can control the devices you run on by doing:
-
-`lein run :cpu 2` - This will run on 2 cpu devices
-`lein run :gpu 1` - This will run on 1 gpu device
-`lein run :gpu 2` - This will run on 2 gpu devices
-
-
diff --git a/contrib/clojure-package/examples/module/project.clj b/contrib/clojure-package/examples/module/project.clj
deleted file mode 100644
index 04e7fa140a06..000000000000
--- a/contrib/clojure-package/examples/module/project.clj
+++ /dev/null
@@ -1,25 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-;; Leiningen project for the module API examples; :main names the
-;; mnist-mlp namespace as the entry point.
-(defproject module-examples "0.1.0-SNAPSHOT"
-  :description "Clojure examples for module"
-  :plugins [[lein-cljfmt "0.5.7"]]
-  :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "2.0.0-SNAPSHOT"]]
-  ;; NOTE(review): presumably disables Leiningen's dependency-conflict
-  ;; checks for this project; confirm against the Leiningen docs.
-  :pedantic? :skip
-  :main mnist-mlp)
-
diff --git a/contrib/clojure-package/examples/module/src/mnist_mlp.clj b/contrib/clojure-package/examples/module/src/mnist_mlp.clj
deleted file mode 100644
index c5ffbbede852..000000000000
--- a/contrib/clojure-package/examples/module/src/mnist_mlp.clj
+++ /dev/null
@@ -1,237 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns mnist-mlp
-  (:require [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.eval-metric :as eval-metric]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.optimizer :as optimizer]
-            [org.apache.clojure-mxnet.symbol :as sym]
-            [org.apache.clojure-mxnet.util :as util]
-            [org.apache.clojure-mxnet.ndarray :as ndarray])
-  (:gen-class))
-
-;; Directory holding the MNIST files, the iterator batch size, and the
-;; number of training epochs used by the examples below.
-(def data-dir "data/")
-(def batch-size 10)
-(def num-epoch 5)
-
-;; Evaluated when this namespace is loaded: if the training images are
-;; missing, run the data script before the iterators below are created.
-(when-not (.exists (io/file (str data-dir "train-images-idx3-ubyte")))
-  (sh "../../scripts/get_mnist_data.sh"))
-;; Create the "model/" directory, which is where checkpoints are saved to
-;; and loaded from; "dummy.txt" itself is never written.
-(io/make-parents "model/dummy.txt")
-
-;;; Load the MNIST datasets
-;; `defonce` keeps the existing iterator when the namespace is reloaded.
-(defonce train-data (mx-io/mnist-iter {:image (str data-dir "train-images-idx3-ubyte")
-                                       :label (str data-dir "train-labels-idx1-ubyte")
-                                       :label-name "softmax_label"
-                                       :input-shape [784]
-                                       :batch-size batch-size
-                                       :shuffle true
-                                       :flat true
-                                       :silent false
-                                       :seed 10}))
-
-;; Test iterator: same options as above except that it sets no :label-name,
-;; :shuffle or :seed.
-(defonce test-data (mx-io/mnist-iter {:image (str data-dir "t10k-images-idx3-ubyte")
-                                      :label (str data-dir "t10k-labels-idx1-ubyte")
-                                      :input-shape [784]
-                                      :batch-size batch-size
-                                      :flat true
-                                      :silent false}))
-(defn get-symbol
-  "Returns the network symbol: the \"data\" variable fed through fully
-  connected layers of 128, 64 and 10 hidden units, with a relu activation
-  after each of the first two, ending in a softmax output named \"softmax\"."
-  []
-  (let [input (sym/variable "data")
-        fc1 (sym/fully-connected "fc1" {:data input :num-hidden 128})
-        relu1 (sym/activation "relu1" {:data fc1 :act-type "relu"})
-        fc2 (sym/fully-connected "fc2" {:data relu1 :num-hidden 64})
-        relu2 (sym/activation "relu2" {:data fc2 :act-type "relu"})
-        fc3 (sym/fully-connected "fc3" {:data relu2 :num-hidden 10})]
-    (sym/softmax-output "softmax" {:data fc3})))
-
-(defn- print-header
-  "Prints `message`, indented by two spaces, between two rules of equals
-  signs, with an empty line before and after."
-  [message]
-  (let [rule "================="]
-    (doseq [line ["" rule (str "  " message) rule ""]]
-      (println line))))
-
-(defn run-intermediate-level-api
-  "Trains for `num-epoch` epochs with explicit forward / update-metric /
-  backward / update calls on every batch of `train-data`, printing the
-  accuracy metric and saving a checkpoint after each epoch.
-
-  Keyword arguments:
-    :devs              contexts passed to the module
-    :load-model-epoch  when given, the module is loaded from the checkpoint
-                       of that epoch instead of being built from `get-symbol`"
-  [& {:keys [devs load-model-epoch]}]
-  (print-header
-   (cond-> "Running Intermediate Level API"
-     load-model-epoch (str " and loading from previous epoch " load-model-epoch)))
-  (let [checkpoint-prefix "model/mnist-mlp"
-        net (if-not load-model-epoch
-              (m/module (get-symbol) {:contexts devs})
-              (do
-                (println "Loading from checkpoint of epoch " load-model-epoch)
-                (m/load-checkpoint {:contexts devs :prefix checkpoint-prefix :epoch load-model-epoch})))
-        acc-metric (eval-metric/accuracy)
-        ;; One training step on a single batch.
-        train-on-batch (fn [batch]
-                         (-> net
-                             (m/forward batch)
-                             (m/update-metric acc-metric (mx-io/batch-label batch))
-                             (m/backward)
-                             (m/update)))]
-    (-> net
-        (m/bind {:data-shapes (mx-io/provide-data-desc train-data) :label-shapes (mx-io/provide-label-desc train-data)})
-        (m/init-params)
-        (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.01 :momentum 0.9})}))
-    (dotimes [epoch-num num-epoch]
-      (println "starting epoch " epoch-num)
-      (mx-io/do-batches train-data train-on-batch)
-      (println "result for epoch " epoch-num " is " (eval-metric/get-and-reset acc-metric))
-      (m/save-checkpoint net {:prefix checkpoint-prefix :epoch epoch-num :save-opt-states true}))))
-
-(defn run-high-level-api
-  "Trains a fresh module on `devs` with a single `m/fit` call, runs
-  `m/predict` on the test data, and prints the accuracy from `m/score`."
-  [devs]
-  (print-header "Running High Level API")
-  (let [net (m/module (get-symbol) {:contexts devs})
-        test-opts {:eval-data test-data}]
-    ;; Training is a single call at this level.
-    (m/fit net {:train-data train-data :eval-data test-data :num-epoch num-epoch})
-    ;; High level predict: the result is discarded here, the call only
-    ;; demonstrates the API.
-    (m/predict net test-opts)
-    ;; High level score: evaluates the accuracy metric on the test data.
-    (println "High level predict score is "
-             (m/score net (assoc test-opts :eval-metric (eval-metric/accuracy))))))
-
-(defn run-predication-and-calc-accuracy-manually
-  "Trains a fresh module on `devs`, gathers all the predictions at once with
-  `predict-every-batch`, then cycles through the test batches and manually
-  tallies the accuracy stats. Prints the stats map and the resulting
-  accuracy (matching predictions divided by predictions counted)."
-  [devs]
-  (print-header "Running Predicting and Calcing the Accuracy Manually")
-
-  (let [mod (m/module (get-symbol) {:contexts devs})]
-    ;;; note only one function for training
-    (m/fit mod {:train-data train-data :eval-data test-data :num-epoch num-epoch})
-    (let [preds (m/predict-every-batch mod {:eval-data test-data})
-          stats (mx-io/reduce-batches test-data
-                                      (fn [r b]
-                                        ;; (:index r) is the number of batches seen so far;
-                                        ;; it selects this batch's entry in `preds`.
-                                        (let [pred-label (->> (ndarray/argmax-channel (first (get preds (:index r))))
-                                                              (ndarray/->vec)
-                                                              (mapv int))
-                                              label (->> (mx-io/batch-label b)
-                                                         (first)
-                                                         (ndarray/->vec)
-                                                         (mapv int))
-                                              ;; Number of predictions in this batch that match the label.
-                                              acc-sum (apply + (mapv (fn [pl l] (if (= pl l) 1 0))
-                                                                     pred-label label))]
-                                          (-> r
-                                              (update :index inc)
-                                              (update :acc-cnt + (count pred-label))
-                                              (update :acc-sum + acc-sum))))
-                                      {:acc-sum 0 :acc-cnt 0 :index 0})]
-      (println "Stats: " stats)
-      (println "Accuracy: " (/ (:acc-sum stats)
-                               (* 1.0 (:acc-cnt stats)))))))
-
-(defn run-prediction-iterator-api
-  "Trains a fresh module on `devs`, then cycles through all the test
-  batches, predicting each one with `predict-batch` and printing that
-  batch's accuracy."
-  [devs]
-  (print-header "Running the Prediction Iterator API and Calcing the Accuracy Manually")
-  (let [net (m/module (get-symbol) {:contexts devs})
-        ;; Converts an ndarray to a vector of ints.
-        ->int-labels (fn [nd] (mapv int (ndarray/->vec nd)))]
-    ;; Training is a single call.
-    (m/fit net {:train-data train-data :eval-data test-data :num-epoch num-epoch})
-    (mx-io/reduce-batches
-     test-data
-     ;; The accumulator is the running batch number.
-     (fn [batch-num batch]
-       (let [batch-preds (m/predict-batch net batch)
-             predicted (->int-labels (ndarray/argmax-channel (first batch-preds)))
-             expected (->int-labels (first (mx-io/batch-label batch)))
-             matches (reduce + 0 (map (fn [p e] (if (= p e) 1 0)) predicted expected))
-             acc (/ matches (* 1.0 (count predicted)))]
-         (println "Batch " batch-num " acc: " acc)
-         (inc batch-num))))))
-
-(defn run-all
-  "Runs every example in this namespace on `devs`: the intermediate level
-  API from scratch, the same again loading the checkpoint of the last
-  epoch, then the high level, prediction iterator and manual accuracy
-  examples."
-  [devs]
-  (let [last-epoch (dec num-epoch)]
-    (run-intermediate-level-api :devs devs)
-    (run-intermediate-level-api :devs devs :load-model-epoch last-epoch)
-    (doseq [example [run-high-level-api
-                     run-prediction-iterator-api
-                     run-predication-and-calc-accuracy-manually]]
-      (example devs))))
-
-(defn -main
-  "Entry point. `args` is [dev dev-num]: a `dev` of \":gpu\" selects gpu
-  contexts and anything else selects cpu contexts; `dev-num` is the number
-  of devices and defaults to 1. Runs all the examples on those contexts."
-  [& args]
-  (let [[dev dev-num] args
-        make-context (if (= dev ":gpu")
-                       #(context/gpu %)
-                       #(context/cpu %))
-        devs (mapv make-context (range (Integer/parseInt (or dev-num "1"))))]
-    (println "Running Module MNIST example")
-    (println "Running with context devices of" devs)
-    (run-all devs)))
-
-(comment
-
-  ;;; run all the example functions
-  (run-all [(context/cpu)])
-
-  ;;; run for the number of epochs
-  (run-intermediate-level-api :devs [(context/cpu)])
-  ;;=> starting epoch  0
-  ;;=> result for epoch  0  is  [accuracy 0.8531333]
-  ;;=> INFO  ml.dmlc.mxnet.module.Module: Saved checkpoint to model/mnist-mlp-0000.params
-  ;;=> INFO  ml.dmlc.mxnet.module.Module: Saved optimizer state to model/mnist-mlp-0000.states
-  ;;=> ....
-  ;;=> starting epoch  4
-  ;;=> result for epoch  4  is  [accuracy 0.91875]
-  ;;=> INFO  ml.dmlc.mxnet.module.Module: Saved checkpoint to model/mnist-mlp-0004.params
-  ;;=> INFO  ml.dmlc.mxnet.module.Module: Saved optimizer state to model/mnist-mlp-0004.states
-
-
-  ;; load from the last saved file and run again
-  (run-intermediate-level-api :devs [(context/cpu)] :load-model-epoch (dec num-epoch))
-  ;;=> Loading from checkpoint of epoch  4
-  ;;=> starting epoch  0
-  ;;=> result for epoch  0  is  [accuracy 0.96258336]
-  ;;=> INFO  ml.dmlc.mxnet.module.Module: Saved checkpoint to model/mnist-mlp-0000.params
-  ;;=> INFO  ml.dmlc.mxnet.module.Module: Saved optimizer state to model/mnist-mlp-0000.states
-  ;;=> ...
-  ;;=> starting epoch  4
-  ;;=> result for epoch  4  is  [accuracy 0.9819833]
-  ;;=> INFO  ml.dmlc.mxnet.module.Module: Saved checkpoint to model/mnist-mlp-0004.params
-  ;;=> INFO  ml.dmlc.mxnet.module.Module: Saved optimizer state to model/mnist-mlp-0004.states
-
-  (run-high-level-api [(context/cpu)])
-  ;;=> ["accuracy" 0.9454]
-
-  (run-prediction-iterator-api [(context/cpu)])
-  ;;=> Batch  0  acc:  1.0
-  ;;=> Batch  1  acc:  0.9
-  ;;=> Batch  2  acc:  1.0
-  ;;=> ...
-  ;;=> Batch  999  acc:  1.0
-
-  (run-predication-and-calc-accuracy-manually [(context/cpu)])
-  ;;=> Stats:  {:acc-sum 9494, :acc-cnt 10000, :index 1000}
-  ;;=> Accuracy:  0.9494
-)
-
diff --git a/contrib/clojure-package/examples/module/test/mnist_mlp_test.clj b/contrib/clojure-package/examples/module/test/mnist_mlp_test.clj
deleted file mode 100644
index 5fbcdd3c0b39..000000000000
--- a/contrib/clojure-package/examples/module/test/mnist_mlp_test.clj
+++ /dev/null
@@ -1,29 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-(ns mnist-mlp-test
-	(:require 
-		[mnist-mlp :refer :all]
-		[org.apache.clojure-mxnet.context :as context]
-		[clojure.test :refer :all]))
-
-;; Smoke test: runs every example once on a single cpu context. It makes
-;; no assertions, so it only fails if an example throws.
-(deftest run-those-tests
-  (let [cpu-devs [(context/cpu)]
-        last-epoch (dec num-epoch)]
-    (run-intermediate-level-api :devs cpu-devs)
-    (run-intermediate-level-api :devs cpu-devs :load-model-epoch last-epoch)
-    (doseq [example [run-high-level-api
-                     run-prediction-iterator-api
-                     run-predication-and-calc-accuracy-manually]]
-      (example cpu-devs))))
\ No newline at end of file
diff --git a/contrib/clojure-package/examples/multi-label/.gitignore b/contrib/clojure-package/examples/multi-label/.gitignore
deleted file mode 100644
index c53038ec0e3d..000000000000
--- a/contrib/clojure-package/examples/multi-label/.gitignore
+++ /dev/null
@@ -1,11 +0,0 @@
-/target
-/classes
-/checkouts
-pom.xml
-pom.xml.asc
-*.jar
-*.class
-/.lein-*
-/.nrepl-port
-.hgignore
-.hg/
diff --git a/contrib/clojure-package/examples/multi-label/README.md b/contrib/clojure-package/examples/multi-label/README.md
deleted file mode 100644
index a2ea7e01e686..000000000000
--- a/contrib/clojure-package/examples/multi-label/README.md
+++ /dev/null
@@ -1,44 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# multi-label
-
-This is a quick example of doing multi-label classification.
-It uses a proxy that implements DataIter to make a custom
-data iterator for MNIST.
-
-
-## Installation
-
-Before you run this example, make sure that you have the clojure package installed.
-In the main clojure package directory, do `lein install`. Then you can run
-`lein install` in this directory.
-
-## Usage
-
-To run the example, use
-`lein run`. This will execute the cpu version.
-
-You can control the devices you run on by doing:
-
-`lein run :cpu` - This will run on 1 cpu device
-`lein run :gpu` - This will run on 1 gpu device
-
-This example only works on 1 device
-
-
-
diff --git a/contrib/clojure-package/examples/multi-label/project.clj b/contrib/clojure-package/examples/multi-label/project.clj
deleted file mode 100644
index c8573bfc2e94..000000000000
--- a/contrib/clojure-package/examples/multi-label/project.clj
+++ /dev/null
@@ -1,23 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-;; Leiningen project for the multi-label classification example; :main
-;; names the multi-label.core namespace as the entry point.
-(defproject multi-label "0.1.0-SNAPSHOT"
-  :description "Example of multi-label classification"
-  :plugins [[lein-cljfmt "0.5.7"]]
-  :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "2.0.0-SNAPSHOT"]]
-  :main multi-label.core)
diff --git a/contrib/clojure-package/examples/multi-label/src/multi_label/core.clj b/contrib/clojure-package/examples/multi-label/src/multi_label/core.clj
deleted file mode 100644
index e96783daad98..000000000000
--- a/contrib/clojure-package/examples/multi-label/src/multi_label/core.clj
+++ /dev/null
@@ -1,167 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns multi-label.core
-  (:require [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [org.apache.clojure-mxnet.eval-metric :as eval-metric]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.symbol :as sym]
-            [org.apache.clojure-mxnet.shape :as mx-shape]
-            [org.apache.clojure-mxnet.util :as util]
-            [org.apache.clojure-mxnet.context :as context])
-  (:import (org.apache.mxnet DataIter)
-           (java.util NoSuchElementException))
-  (:gen-class))
-
-;; Directory holding the MNIST files, the iterator batch size, and the
-;; intended number of training epochs.
-(def data-dir "data/")
-(def batch-size 100)
-(def num-epoch 1)
-
-;; Evaluated when this namespace is loaded: if the training images are
-;; missing, run the data script before the iterators below are created.
-(when-not (.exists (io/file (str data-dir "train-images-idx3-ubyte")))
-  (sh "../../scripts/get_mnist_data.sh"))
-
-;;; Load the MNIST datasets
-;; `defonce` keeps the existing iterator when the namespace is reloaded.
-(defonce train-data (mx-io/mnist-iter {:image (str data-dir "train-images-idx3-ubyte")
-                                       :label (str data-dir "train-labels-idx1-ubyte")
-                                       :label-name "softmax_label"
-                                       :input-shape [784]
-                                       :batch-size batch-size
-                                       :shuffle true
-                                       :flat true
-                                       :silent false
-                                       :seed 10}))
-
-;; Test iterator: same options as above except that it sets no :label-name,
-;; :shuffle or :seed.
-(defonce test-data (mx-io/mnist-iter {:image (str data-dir "t10k-images-idx3-ubyte")
-                                      :label (str data-dir "t10k-labels-idx1-ubyte")
-                                      :input-shape [784]
-                                      :batch-size batch-size
-                                      :flat true
-                                      :silent false}))
-(defn build-network
-  "Returns a symbol group with two softmax outputs, \"softmax1\" and
-  \"softmax2\", that both read the same \"fc3\" layer. The shared part feeds
-  the \"data\" variable through fully connected layers of 128, 64 and 10
-  hidden units, with a relu activation after each of the first two."
-  []
-  (let [input (sym/variable "data")
-        fc1 (sym/fully-connected "fc1" {:data input :num-hidden 128})
-        relu1 (sym/activation "relu1" {:data fc1 :act-type "relu"})
-        fc2 (sym/fully-connected "fc2" {:data relu1 :num-hidden 64})
-        relu2 (sym/activation "relu2" {:data fc2 :act-type "relu"})
-        fc3 (sym/fully-connected "fc3" {:data relu2 :num-hidden 10})
-        softmax-heads [(sym/softmax-output "softmax1" {:data fc3})
-                       (sym/softmax-output "softmax2" {:data fc3})]]
-    (sym/group softmax-heads)))
-
-;;; A proxy of the DataIter Scala class that wraps `train-data` and
-;;; delegates to it, except that the label is supplied twice, once for each
-;;; of the two softmax outputs of the network.
-(def multi-train-data (let [data-iter train-data]
-                        (proxy [DataIter] []
-                          (hasNext []
-                            (mx-io/has-next? data-iter))
-                          ;; Advances the wrapped iterator and builds a new batch whose
-                          ;; label vector holds the wrapped iterator's first label twice.
-                          ;; Throws NoSuchElementException when there are no more batches.
-                          (next []
-                            (if (mx-io/has-next? data-iter)
-                              (let [batch (mx-io/next data-iter)]
-                                (mx-io/data-batch {:data (util/scala-vector->vec
-                                                          (.getData data-iter))
-                                                   :label (let [label (first
-                                                                       (util/scala-vector->vec (.getLabel data-iter)))]
-                                                            [label label])
-                                                   :index (util/scala-vector->vec
-                                                           (.getIndex data-iter))
-                                                   :pad (.pad batch)}))
-                              (throw (new NoSuchElementException))))
-                          (reset []
-                            (mx-io/reset data-iter))
-                          (batchSize []
-                            (.batchSize data-iter))
-                          (getData []
-                            (.getData data-iter))
-                          ;; Same duplication as in `next`: the first label, twice.
-                          (getLabel []
-                            (let [label (first  (util/scala-vector->vec (.getLabel data-iter)))]                              (util/vec->indexed-seq [label label])))
-                          (getIndex []
-                            (.getIndex data-iter))
-                          (getPad []
-                            (.getPad data-iter))
-                          ;; Reports two labels, "softmax1_label" and "softmax2_label",
-                          ;; both with the shape taken from the wrapped iterator's
-                          ;; first provided label.
-                          (provideLabel []
-                            (let [shape (->> (mx-io/provide-label data-iter)
-                                             (first)
-                                             (vals)
-                                             last)]
-                              (util/list-map
-                               {"softmax1_label" (mx-shape/->shape shape)
-                                "softmax2_label" (mx-shape/->shape shape)})))
-                          (provideData []
-                            (.provideData data-iter)))))
-
-(defn train
-  "Trains the two-output network from `build-network` on `multi-train-data`
-  using the contexts in `devs`, for `num-epoch` epochs.
-
-  For every batch it runs forward, counts for each output how many
-  predictions match the label, then runs backward and update. After each
-  epoch it prints the accumulated counts and, for each output, the count
-  divided by the number of batches (the average number of correct
-  predictions per batch)."
-  [devs]
-  (let [network (build-network)
-        ;; Map from data/label name to shape, used for shape inference.
-        data-and-labels     (->> (into (mx-io/provide-data multi-train-data)
-                                       (mx-io/provide-label multi-train-data))
-                                 (mapcat vals)
-                                 (apply hash-map))
-        ;; The output shapes (second element) are not needed here.
-        [arg-shapes _ aux-shapes] (sym/infer-shape network data-and-labels)
-        arg-names (sym/list-arguments network)
-        aux-names (sym/list-auxiliary-states network)
-        arg-params (zipmap arg-names (mapv #(ndarray/empty %) arg-shapes))
-        aux-params (zipmap aux-names (mapv #(ndarray/empty %) aux-shapes))
-        mod (-> (m/module network {:contexts devs})
-                (m/bind {:data-shapes (mx-io/provide-data multi-train-data)
-                         :label-shapes (mx-io/provide-label multi-train-data)})
-                (m/init-params {:arg-params arg-params :aux-params aux-params})
-                (m/init-optimizer))]
-    ;; Use the namespace's `num-epoch` setting rather than a hard-coded
-    ;; single pass.
-    (doseq [i (range num-epoch)]
-      (println "Doing epoch " i)
-      (let [acc  (mx-io/reduce-batches
-                  multi-train-data
-                  (fn [r b]
-                    (let [labels (mx-io/batch-label b)
-                          preds (-> (m/forward mod b)
-                                    (m/outputs))
-                          ;; One entry per output: the number of predictions in
-                          ;; this batch that match the corresponding label.
-                          accs (mapv (fn [p l]
-                                       (let [pred-label (->> (ndarray/argmax-channel (first p))
-                                                             (ndarray/->vec)
-                                                             (mapv int))
-                                             label (->> (ndarray/->vec l)
-                                                        (mapv int))]
-                                         (* 1.0 (apply + (mapv (fn [pl l] (if (= pl l) 1 0))
-                                                               pred-label label)))))
-                                     preds labels)]
-                      (-> mod
-                          (m/backward)
-                          (m/update))
-                      (-> r
-                          (update :sum #(mapv (fn [o n] (+ o n)) % accs))
-                          (update :batch-num inc))))
-                  {:sum [0 0] :batch-num 0})]
-        (println "Multi-accuracy " acc)
-        (println "Multi-accuracy " (mapv #(/ % (:batch-num acc)) (:sum acc)))))))
-
-(defn -main
-  "Entry point. `args` is [dev dev-num]: a `dev` of \":gpu\" selects gpu
-  contexts and anything else selects cpu contexts; `dev-num` is the number
-  of devices and defaults to 1. Trains the network on those contexts."
-  [& args]
-  (let [[dev dev-num] args
-        make-context (if (= dev ":gpu")
-                       #(context/gpu %)
-                       #(context/cpu %))
-        devs (mapv make-context (range (Integer/parseInt (or dev-num "1"))))]
-    (println "Training...")
-    (println "Running with context devices of" devs)
-    (train devs)))
-
-(comment
-  (train [(context/cpu)]))
diff --git a/contrib/clojure-package/examples/multi-label/test/multi_label_test.clj b/contrib/clojure-package/examples/multi-label/test/multi_label_test.clj
deleted file mode 100644
index 446a84626e72..000000000000
--- a/contrib/clojure-package/examples/multi-label/test/multi_label_test.clj
+++ /dev/null
@@ -1,26 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns multi_label_test
- (:require 
- 	[multi-label.core :as label]
- 	[clojure.java.io :as io]
- 	[org.apache.clojure-mxnet.context :as context]
- 	[clojure.test :refer :all]))
-
-(deftest run-multi-label
-  ;; Smoke test: runs the multi-label training example on a single cpu
-  ;; context. There are no assertions; the test only fails if training throws.
-  (let [cpu-devices [(context/cpu)]]
-    (label/train cpu-devices)))
\ No newline at end of file
diff --git a/contrib/clojure-package/examples/pre-trained-models/src/pre_trained_models/fine_tune.clj b/contrib/clojure-package/examples/pre-trained-models/src/pre_trained_models/fine_tune.clj
deleted file mode 100644
index 93c121f9fc16..000000000000
--- a/contrib/clojure-package/examples/pre-trained-models/src/pre_trained_models/fine_tune.clj
+++ /dev/null
@@ -1,127 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns pre-trained-models.fine-tune
-  (:require [clojure.string :as string]
-            [org.apache.clojure-mxnet.callback :as callback]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.initializer :as init]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.symbol :as sym])
-  (:gen-class))
-
-;;; From the finetune example https://mxnet.incubator.apache.org/faq/finetune.html
-
-;; run download-resnet-50.sh to get the model params and json
-;; and download-caltech.sh to get the pregenerated rec files
-
-;; Directory that holds the pretrained checkpoint; `get-model` loads the
-;; "resnet-50" checkpoint from it.
-(def model-dir "model")
-;; Batch size shared by both record iterators below and by the speedometer
-;; callback in `fit`.
-(def batch-size 16)
-
-;;; image set is http://www.vision.caltech.edu/Image_Datasets/Caltech101/
-;; Pictures of objects belonging to 101 categories. About 40 to 800 images per category. Most categories have about 50 images
-;; NOTE(review): the comment above describes Caltech101, but the record files
-;; below are named caltech-256 and `fine-tune!` uses 256 classes -- confirm
-;; which data set is intended.
-
-;; Training iterator over the pregenerated training record file. Created when
-;; this namespace is loaded. Shuffling, random cropping and random mirroring
-;; are all enabled.
-(def train-iter (mx-io/image-record-iter
-                 {:path-imgrec "caltech-256/caltech-256-60-train.rec"
-                  :data-name "data"
-                  :label-name "softmax_label"
-                  :batch-size batch-size
-                  :data-shape [3 224 224]
-                  :shuffle true
-                  :rand-crop true
-                  :rand-mirror true}))
-
-;; Validation iterator over the pregenerated validation record file. Created
-;; when this namespace is loaded. Random cropping and mirroring are disabled
-;; and no :shuffle option is passed.
-(def val-iter (mx-io/image-record-iter
-               {:path-imgrec "caltech-256/caltech-256-60-val.rec"
-                :data-name "data"
-                :label-name "softmax_label"
-                :batch-size batch-size
-                :data-shape [3 224 224]
-                :rand-crop false
-                :rand-mirror false}))
-
-(defn get-model
-  "Loads epoch 0 of the \"resnet-50\" checkpoint found under `model-dir` and
-  returns a map with its symbol (:msymbol), argument parameters (:arg-params)
-  and auxiliary parameters (:aux-params)."
-  []
-  (let [checkpoint-prefix (str model-dir "/resnet-50")
-        pretrained (m/load-checkpoint {:prefix checkpoint-prefix :epoch 0})]
-    {:msymbol (m/symbol pretrained)
-     :arg-params (m/arg-params pretrained)
-     :aux-params (m/aux-params pretrained)}))
-
-(defn get-fine-tune-model
-  "Builds the network to fine-tune on top of a pretrained symbol.
-
-  Takes a map with:
-    msymbol:     the pretrained network symbol
-    arg-params:  the argument parameters of the pretrained model
-    num-classes: the number of classes for the fine-tune datasets
-    layer-name:  the layer name before the last fully-connected layer
-                 (defaults to \"flatten0\")
-
-  Returns a map with:
-    :net      the internal output \"<layer-name>_output\" followed by a new
-              fully-connected layer \"fc1\" and a softmax output \"softmax\"
-    :new-args `arg-params` without the entries whose name contains \"fc1\""
-  [{:keys [msymbol arg-params num-classes layer-name]
-    :or {layer-name "flatten0"}}]
-  (let [feature-layer (sym/get (sym/get-internals msymbol)
-                               (str layer-name "_output"))
-        classifier (sym/fully-connected "fc1" {:data feature-layer
-                                               :num-hidden num-classes})
-        fc1-param? (fn [[param-name _]] (string/includes? param-name "fc1"))]
-    {:net (sym/softmax-output "softmax" {:data classifier})
-     :new-args (into {} (remove fc1-param? arg-params))}))
-
-(defn fit
-  "Binds `msymbol` on the contexts in `devs`, initializes it from the given
-  parameters and trains it for one epoch.
-
-  devs:       vector of contexts to train on
-  msymbol:    network symbol to train
-  arg-params: argument parameters used to initialize the module
-  aux-params: auxiliary parameters used to initialize the module
-
-  Parameters that are absent from `arg-params`/`aux-params` are allowed
-  (:allow-missing true). Trains on `train-iter`, evaluates on `val-iter`, and
-  reports through a speedometer callback created with `batch-size` and 10.
-  Returns the result of `m/fit`."
-  [devs msymbol arg-params aux-params]
-  ;; Both the data and the label descriptors come from the training iterator,
-  ;; since that is the iterator the bound module is trained on.
-  (let [mod (-> (m/module msymbol {:contexts devs})
-                (m/bind {:data-shapes (mx-io/provide-data-desc train-iter)
-                         :label-shapes (mx-io/provide-label-desc train-iter)})
-                (m/init-params {:arg-params arg-params :aux-params aux-params
-                                :allow-missing true}))]
-    (m/fit mod
-           {:train-data train-iter
-            :eval-data val-iter
-            :num-epoch 1
-            ;; The key must be spelled :initializer; the previous :intializer
-            ;; spelling did not match the option name.
-            :fit-params (m/fit-params {:initializer (init/xavier {:rand-type "gaussian"
-                                                                  :factor-type "in"
-                                                                  :magnitude 2})
-                                       :batch-end-callback (callback/speedometer batch-size 10)})})))
-
-(defn fine-tune!
-  "Loads the pretrained model, replaces its classifier with a new 256-class
-  head via `get-fine-tune-model`, and trains the result on the contexts in
-  `devs` using `fit`.
-
-  The new network is initialized from the pretrained argument parameters
-  minus the \"fc1\" entries (`new-args`) and from the pretrained auxiliary
-  parameters."
-  [devs]
-  (let [{:keys [aux-params] :as model} (get-model)
-        {:keys [net new-args]} (get-fine-tune-model (merge model {:num-classes 256}))]
-    ;; The last argument of `fit` is aux-params. Passing the pretrained
-    ;; arg-params there (as before) handed the module the wrong parameter map
-    ;; and left the loaded auxiliary parameters unused.
-    (fit devs net new-args aux-params)))
-
-(defn -main
-  "Entry point. Reads an optional device kind and device count from `args`,
-  builds one context per device id in [0, count) -- gpu contexts when the
-  first argument is the string \":gpu\", cpu contexts otherwise -- and runs
-  `fine-tune!` on them. The count defaults to 1 when it is not supplied."
-  [& args]
-  (let [[dev dev-num] args
-        device-count (Integer/parseInt (or dev-num "1"))
-        make-context (if (= dev ":gpu")
-                       (fn [id] (context/gpu id))
-                       (fn [id] (context/cpu id)))
-        devs (mapv make-context (range device-count))]
-    (println "Running with context devices of" devs)
-    (fine-tune! devs)))
-
-(comment
-
-  (fine-tune! [(context/cpu)])
-
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [10]	Speed: 3.61 samples/sec	Train-accuracy=0.000000
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [20]	Speed: 3.49 samples/sec	Train-accuracy=0.005952
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [30]	Speed: 3.58 samples/sec	Train-accuracy=0.012097
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [40]	Speed: 3.49 samples/sec	Train-accuracy=0.013720
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [50]	Speed: 3.51 samples/sec	Train-accuracy=0.017157
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [60]	Speed: 3.56 samples/sec	Train-accuracy=0.017418
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [70]	Speed: 3.56 samples/sec	Train-accuracy=0.023768
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [80]	Speed: 3.10 samples/sec	Train-accuracy=0.024691
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [90]	Speed: 3.27 samples/sec	Train-accuracy=0.028846
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [100]	Speed: 3.42 samples/sec	Train-accuracy=0.033416
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [110]	Speed: 3.46 samples/sec	Train-accuracy=0.034910
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [120]	Speed: 3.44 samples/sec	Train-accuracy=0.040806
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [130]	Speed: 3.41 samples/sec	Train-accuracy=0.043893
-;; INFO  ml.dmlc.mxnet.Callback$Speedometer: Epoch[0] Batch [140]	Speed: 3.42 samples/sec	Train-accuracy=0.045213
-)
-
diff --git a/contrib/clojure-package/examples/rnn/.gitignore b/contrib/clojure-package/examples/rnn/.gitignore
deleted file mode 100644
index c53038ec0e3d..000000000000
--- a/contrib/clojure-package/examples/rnn/.gitignore
+++ /dev/null
@@ -1,11 +0,0 @@
-/target
-/classes
-/checkouts
-pom.xml
-pom.xml.asc
-*.jar
-*.class
-/.lein-*
-/.nrepl-port
-.hgignore
-.hg/
diff --git a/contrib/clojure-package/examples/rnn/README.md b/contrib/clojure-package/examples/rnn/README.md
deleted file mode 100644
index 1cedc179612a..000000000000
--- a/contrib/clojure-package/examples/rnn/README.md
+++ /dev/null
@@ -1,44 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# rnn
-
-
-Demonstration of an LSTM RNN trained on Obama's text
-
-## Installation
-
-Before you run this example, make sure that you have the clojure package installed.
-In the main clojure package directory, do `lein install`. Then you can run
-`lein install` in this directory.
-
-
-## Usage
-
-
-Run `./get_data.sh` to download the training corpus as well as the pretrained model.
-
-Run `lein run` to start training the corpus from scratch for 2 epochs and then
-show the result of training after 75 epochs (cpu)
-
-You can control the devices you run on by doing:
-
-`lein run :cpu 2` - This will run on 2 cpu devices
-`lein run :gpu 1` - This will run on 1 gpu device
-`lein run :gpu 2` - This will run on 2 gpu devices
-
-
diff --git a/contrib/clojure-package/examples/rnn/get_data.sh b/contrib/clojure-package/examples/rnn/get_data.sh
deleted file mode 100755
index 4e4a2dc3e4a1..000000000000
--- a/contrib/clojure-package/examples/rnn/get_data.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Abort on the first failing command (-e) and echo both the script lines (-v)
-# and the expanded commands (-x) while running.
-set -evx
-
-# Downloads and unpacks the char-lstm archive into ./data, relative to the
-# current working directory.
-# -p: do not fail under `set -e` when ./data already exists, so the script
-# can be re-run.
-mkdir -p data
-cd data
-wget http://data.mxnet.io/mxnet/data/char_lstm.zip
-# -o: overwrite previously extracted files instead of prompting, which would
-# block a non-interactive run.
-unzip -o char_lstm.zip
-cd ..
diff --git a/contrib/clojure-package/examples/rnn/project.clj b/contrib/clojure-package/examples/rnn/project.clj
deleted file mode 100644
index 40c61eeedede..000000000000
--- a/contrib/clojure-package/examples/rnn/project.clj
+++ /dev/null
@@ -1,23 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-;; Leiningen project definition for the rnn example.
-(defproject rnn "0.1.0-SNAPSHOT"
-  :description "RNN example"
-  ;; cljfmt Leiningen plugin (source formatting)
-  :plugins [[lein-cljfmt "0.5.7"]]
-  :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "2.0.0-SNAPSHOT"]]
-  ;; namespace whose -main function `lein run` invokes
-  :main rnn.train-char-rnn)
diff --git a/contrib/clojure-package/examples/rnn/src/rnn/lstm.clj b/contrib/clojure-package/examples/rnn/src/rnn/lstm.clj
deleted file mode 100644
index fb3a8f352dee..000000000000
--- a/contrib/clojure-package/examples/rnn/src/rnn/lstm.clj
+++ /dev/null
@@ -1,192 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns rnn.lstm
-  (:require [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.executor :as executor]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.symbol :as sym]))
-
-(defn lstm-param
-  "Bundles the four parameter symbols of one LSTM layer into a map keyed by
-  :i2h-weight, :i2h-bias, :h2h-weight and :h2h-bias."
-  [i2h-weight i2h-bias
-   h2h-weight h2h-bias]
-  (zipmap [:i2h-weight :i2h-bias :h2h-weight :h2h-bias]
-          [i2h-weight i2h-bias h2h-weight h2h-bias]))
-
-(defn lstm-state
-  "Bundles a cell symbol `c` and a hidden symbol `h` into a state map.
-  The map is an array-map with :c before :h; `lstm-inference-symbol` reads the
-  values with `vals`, so this order is part of the contract."
-  [c h]
-  (array-map :c c :h h))
-
-;; Builds the symbols of a single LSTM cell for time step `seq-idx` of layer
-;; `layer-idx`.
-;;   num-hidden: hidden size; both fully-connected layers emit 4 * num-hidden
-;;               values, which are sliced into four gate inputs
-;;   in-data:    input symbol for this step
-;;   prev-state: map with the previous :c and :h symbols (see `lstm-state`)
-;;   param:      map of weight/bias symbols (see `lstm-param`)
-;;   dropout:    dropout probability applied to `in-data` when positive
-;; Returns an `lstm-state` map holding the next :c and :h symbols.
-(defn lstm [num-hidden in-data prev-state param seq-idx layer-idx dropout]
-  ;; apply dropout to the input only when a positive ratio was requested
-  (let [in-dataa (if (pos? dropout)
-                   (sym/dropout {:data in-data :p dropout})
-                   in-data)
-        ;; input-to-hidden and hidden-to-hidden projections; names encode the
-        ;; time step and layer
-        i2h (sym/fully-connected (str "t" seq-idx "_l" layer-idx "_i2h")
-                                 {:data in-dataa :weight (:i2h-weight param)
-                                  :bias (:i2h-bias param) :num-hidden (* num-hidden 4)})
-        h2h (sym/fully-connected (str "t" seq-idx "_l" layer-idx "_h2h")
-                                 {:data (:h prev-state) :weight (:h2h-weight param)
-                                  :bias (:h2h-bias param) :num-hidden (* num-hidden 4)})
-        gates (sym/+ i2h h2h)
-        ;; split the summed projection into the four gate pre-activations
-        slice-gates (sym/slice-channel (str "t" seq-idx "_l" layer-idx "_slice")
-                                       {:data gates :num-outputs 4})
-        in-gate (sym/activation {:data (sym/get slice-gates 0) :act-type "sigmoid"})
-        in-transform (sym/activation {:data (sym/get slice-gates 1) :act-type "tanh"})
-        forget-gate (sym/activation {:data (sym/get slice-gates 2) :act-type "sigmoid"})
-        out-gate (sym/activation {:data (sym/get slice-gates 3) :act-type "sigmoid"})
-        ;; next-c = forget-gate * prev-c + in-gate * in-transform
-        next-c (sym/+ (sym/* forget-gate (:c prev-state))
-                      (sym/* in-gate in-transform))
-        ;; next-h = out-gate * tanh(next-c)
-        next-h (sym/* out-gate (sym/activation {:data next-c :act-type "tanh"}))]
-    (lstm-state next-c next-h)))
-
-(defn lstm-unroll
-  "Builds the training symbol: a stack of `num-lstm-layer` LSTM layers
-  unrolled over `seq-len` time steps.
-
-  num-lstm-layer: number of stacked LSTM layers
-  seq-len:        number of time steps to unroll
-  input-size:     input dimension of the embedding layer
-  num-hidden:     hidden size of every LSTM layer
-  num-embed:      output dimension of the embedding layer
-  num-label:      number of outputs of the final fully-connected layer
-  dropout:        dropout probability; applied between layers (not before the
-                  first one) and on each step's top-layer output when positive
-
-  The states produced at one time step are fed into the next time step, so
-  the per-layer \"l<i>_init_c_beta\"/\"l<i>_init_h_beta\" variables are only
-  used for the first step. Returns the softmax output symbol \"softmax\"."
-  [num-lstm-layer seq-len input-size num-hidden num-embed num-label dropout]
-  (let [embed-weight (sym/variable "embed_weight")
-        cls-weight (sym/variable "cls_weight")
-        cls-bias (sym/variable "cls_bias")
-        param-cells (mapv (fn [i]
-                            (lstm-param (sym/variable (str "l" i "_i2h_weight"))
-                                        (sym/variable (str "l" i "_i2h_bias"))
-                                        (sym/variable (str "l" i "_h2h_weight"))
-                                        (sym/variable (str "l" i "_h2h_bias"))))
-                          (range 0 num-lstm-layer))
-        ;; initial state of every layer, consumed by the first time step
-        last-states (mapv (fn [i]
-                            (lstm-state (sym/variable (str "l" i "_init_c_beta"))
-                                        (sym/variable (str "l" i "_init_h_beta"))))
-                          (range 0 num-lstm-layer))
-        ;; embedding layer
-        data (sym/variable "data")
-        label (sym/variable "softmax_label")
-        embed (sym/embedding "embed" {:data data :input-dim input-size :weight embed-weight
-                                      :output-dim num-embed})
-        wordvec (sym/slice-channel {:data embed :num-outputs seq-len :squeeze-axis 1})
-        ;; stack lstm: walk the time steps in order, threading the per-layer
-        ;; states so step t+1 starts from the states produced at step t
-        ;; (previously every step restarted from the initial states)
-        hidden-all (loop [seq-idx 0
-                          prev-states last-states
-                          hiddens []]
-                     (if (>= seq-idx seq-len)
-                       hiddens
-                       (let [next-states (loop [i 0
-                                                hidden (sym/get wordvec seq-idx)
-                                                next-states []]
-                                           (if (= i num-lstm-layer)
-                                             next-states
-                                             ;; no dropout on the input of the first layer
-                                             (let [dp-ratio (if (zero? i) 0 dropout)
-                                                   next-state (lstm num-hidden
-                                                                    hidden
-                                                                    (get prev-states i)
-                                                                    (get param-cells i)
-                                                                    seq-idx
-                                                                    i
-                                                                    dp-ratio)]
-                                               (recur (inc i)
-                                                      (:h next-state)
-                                                      (conj next-states next-state)))))
-                             ;; output of the top layer for this time step
-                             top-hidden (:h (last next-states))
-                             step-hidden (if (pos? dropout)
-                                           (sym/dropout {:data top-hidden :p dropout})
-                                           top-hidden)]
-                         (recur (inc seq-idx)
-                                next-states
-                                (conj hiddens step-hidden)))))
-        hidden-concat (sym/concat "concat" nil hidden-all {:dim 0})
-        pred (sym/fully-connected "pred" {:data hidden-concat :num-hidden num-label
-                                          :weight cls-weight :bias cls-bias})
-        ;; transpose and flatten the label before it is passed to the softmax
-        label (sym/transpose {:data label})
-        label (sym/reshape {:data label :target-shape [0]})
-        sm (sym/softmax-output "softmax" {:data pred :label label})]
-    sm))
-
-;; Builds the single-step inference symbol for the stacked LSTM (the step
-;; index is fixed to 0).
-;; Returns a symbol group whose first member is the softmax output, followed
-;; by the values of every layer's next state map in layer order.
-(defn lstm-inference-symbol [num-lstm-layer input-size num-hidden
-                             num-embed num-label dropout]
-  (let [seq-idx 0
-        embed-weight (sym/variable "embed_weight")
-        cls-weight (sym/variable "cls_weight")
-        cls-bias (sym/variable "cls_bias")
-        param-cells (mapv (fn [i]
-                            (lstm-param (sym/variable (str "l" i "_i2h_weight"))
-                                        (sym/variable (str "l" i "_i2h_bias"))
-                                        (sym/variable (str "l" i "_h2h_weight"))
-                                        (sym/variable (str "l" i "_h2h_bias"))))
-                          (range 0 num-lstm-layer))
-        ;; state inputs of every layer, exposed as variables
-        last-states (mapv (fn [i]
-                            (lstm-state (sym/variable (str "l" i "_init_c_beta"))
-                                        (sym/variable (str "l" i "_init_h_beta"))))
-                          (range 0 num-lstm-layer))
-        data (sym/variable "data")
-        ;; NOTE(review): this binding is shadowed inside the loop below and
-        ;; never read at this level -- looks removable.
-        dp-ratio 0
-        ;; stack lstm
-        next-states (loop [i 0
-                           hidden (sym/embedding "embed" {:data data :input-dim input-size :weight embed-weight :output-dim num-embed})
-                           next-states []]
-                      (if (= i num-lstm-layer)
-                        next-states
-                        ;; no dropout on the input of the first layer
-                        (let [dp-ratio (if (zero? i) 0 dropout)
-                              next-state (lstm num-hidden
-                                               hidden
-                                               (get last-states i)
-                                               (get param-cells i)
-                                               seq-idx
-                                               i
-                                               dp-ratio)]
-                          (recur (inc i)
-                                 (:h next-state)
-                                 (conj next-states next-state)))))
-        ;;; decoder
-        hidden (:h (last next-states))
-        hidden (if (pos? dropout) (sym/dropout {:data hidden :p dropout}) hidden)
-        fc (sym/fully-connected "pred" {:data hidden :num-hidden num-label
-                                        :weight cls-weight :bias cls-bias})
-        sm (sym/softmax-output "softmax" {:data fc})
-        ;; softmax first, then each layer's state values in the order `vals`
-        ;; returns them from the `lstm-state` map
-        outs (into [sm] (mapcat (fn [next-s] (vals next-s)) next-states))]
-    (sym/group outs)))
-
-(defn lstm-inference-model
-  "Binds the single-step inference symbol to an executor with batch size 1
-  and loads trained parameters into it.
-
-  Takes a map with :num-lstm-layer, :input-size, :num-hidden, :num-embed,
-  :num-label, :arg-params (name -> trained parameter), :ctx (defaults to
-  `(context/cpu)`) and :dropout (defaults to 0.0).
-
-  Every entry of `arg-params` that has a same-named executor argument is
-  copied into that argument, except names that are part of the input shapes
-  and \"softmax_label\".
-
-  Returns a map with :exec (the bound executor) and :states-map, which maps
-  each \"l<i>_init_c_beta\" / \"l<i>_init_h_beta\" name to the executor outputs
-  that follow the first output, in that order."
-  [{:keys [num-lstm-layer input-size num-hidden
-           num-embed num-label arg-params
-           ctx dropout]
-    :or {ctx (context/cpu)
-         dropout 0.0}}]
-  (let [lstm-sym (lstm-inference-symbol num-lstm-layer input-size num-hidden
-                                        num-embed num-label dropout)
-        batch-size 1
-        layer-ids (range num-lstm-layer)
-        state-shape [batch-size num-hidden]
-        init-c (into {} (for [l layer-ids]
-                          [(str "l" l "_init_c_beta") state-shape]))
-        init-h (into {} (for [l layer-ids]
-                          [(str "l" l "_init_h_beta") state-shape]))
-        input-shape (merge init-c init-h {"data" [batch-size]})
-        exec (sym/simple-bind lstm-sym ctx input-shape)
-        exec-arg-map (executor/arg-map exec)
-        state-names (mapcat (fn [l] [(str "l" l "_init_c_beta")
-                                     (str "l" l "_init_h_beta")])
-                            layer-ids)
-        states-map (zipmap state-names (rest (executor/outputs exec)))]
-    (doseq [[param-name param-value] arg-params]
-      (let [target (get exec-arg-map param-name)]
-        (when (and target
-                   (not (get input-shape param-name))
-                   (not= "softmax_label" param-name))
-          (ndarray/copy-to param-value target))))
-    {:exec exec
-     :states-map states-map}))
-
-(defn forward
-  "Runs one inference step on a model built by `lstm-inference-model`.
-
-  When `new-seq` is truthy, every executor argument named in `states-map` is
-  first set to 0. `input-data` is then copied into the executor's \"data\"
-  argument, the executor is run forward, and each value of `states-map` is
-  copied into the executor argument of the same name so the next call
-  continues from it. Returns the executor's first output."
-  [{:keys [exec states-map] :as lstm-model} input-data new-seq]
-  (when new-seq
-    (doseq [state-name (keys states-map)]
-      (ndarray/set (get (executor/arg-map exec) state-name) 0)))
-  (ndarray/copy-to input-data (get (executor/arg-map exec) "data"))
-  (executor/forward exec)
-  (doseq [[state-name state-output] states-map]
-    (ndarray/copy-to state-output (get (executor/arg-map exec) state-name)))
-  (first (executor/outputs exec)))
diff --git a/contrib/clojure-package/examples/rnn/src/rnn/test_char_rnn.clj b/contrib/clojure-package/examples/rnn/src/rnn/test_char_rnn.clj
deleted file mode 100644
index 22a2982f222b..000000000000
--- a/contrib/clojure-package/examples/rnn/src/rnn/test_char_rnn.clj
+++ /dev/null
@@ -1,82 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns rnn.test-char-rnn
-  (:require [clojure.string :as string]
-            [clojure.java.shell :refer [sh]]
-            [rnn.util :as util]
-            [rnn.lstm :as lstm]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.executor :as executor]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]))
-
-;; Runs ./get_data.sh when this namespace is loaded and no "data" file or
-;; directory exists relative to the working directory.
-(when-not (.exists (clojure.java.io/file "data"))
-  (do (println "Retrieving data...") (sh "./get_data.sh")))
-
-;; Text corpus used to build the vocabulary below.
-(def data-path "data/obama.txt")
-;; NOTE(review): defined without a value, so the var is unbound; `rnn-test`
-;; takes its own `model-prefix` argument, which shadows this var.
-(def model-prefix)
-;; Text used to prime the model before generated characters are appended.
-(def start-sentence "The joke ")
-(def num-hidden 512) ;; hidden unit in LSTM cell
-(def num-embed 256) ;; the embedding dim (a char is mapped to 256 dim)
-(def num-lstm-layer 3) ;; number of lstm layers
-
-;; Vocabulary built from the corpus at namespace load time.
-(def vocab (util/build-vocab data-path))
-
-(defn rnn-test
-  "Generates text with a trained character-level LSTM.
-
-  model-prefix: checkpoint prefix passed to `m/load-checkpoint`
-  epoch-num:    checkpoint epoch to load
-  seq-length:   total number of steps to run, including the steps that
-                consume `start-sentence`
-  random?:      passed to `util/make-output`; use it to pick a random sample
-                instead of the max-probability character
-
-  The model is first fed the characters of `start-sentence` one at a time;
-  from the last of those characters on, each predicted character is appended
-  to the output and fed back as the next input. Prints the starter sentence
-  and returns the resulting string."
-  [model-prefix epoch-num seq-length random?]
-  (let [trained-mod (m/load-checkpoint {:prefix model-prefix :epoch epoch-num})
-        trained-arg-params (m/arg-params trained-mod)
-        vocab-size (inc (count vocab))
-        ;; use the namespace-level layer count rather than a second,
-        ;; hard-coded copy of it
-        model (lstm/lstm-inference-model {:num-lstm-layer num-lstm-layer
-                                          :input-size vocab-size
-                                          :num-label vocab-size
-                                          :num-hidden num-hidden
-                                          :num-embed num-embed
-                                          :arg-params trained-arg-params})
-        input-ndarray (ndarray/zeros [1])
-        revert-vocab (util/make-revert-vocab vocab)
-        ;; index 0 maps to the empty string; the remaining entries follow the
-        ;; sorted vocabulary values
-        fix-dict (into [""]
-                       (mapv #(str (get revert-vocab %))
-                             (sort (vals vocab))))
-        ignore-length (count start-sentence)]
-    (println "Starter sentence: " start-sentence)
-    (println "===")
-    (loop [i 0
-           new-sentence true
-           output start-sentence]
-      (if (= seq-length i)
-        output
-        (do
-          ;; while still inside the starter sentence feed its i-th character,
-          ;; afterwards feed back the last generated character
-          (if (<= i (dec ignore-length))
-            (util/make-input (get start-sentence i) vocab input-ndarray)
-            (util/make-input (last output) vocab input-ndarray))
-          (let [prob (ndarray/->vec (lstm/forward model input-ndarray new-sentence))
-                next-char (util/make-output prob fix-dict random?)]
-            (recur (inc i)
-                   ;; an empty prediction starts a new sequence, which makes
-                   ;; `lstm/forward` reset the states on the next step
-                   (= "" next-char)
-                   ;; predictions made before the last starter character are
-                   ;; discarded
-                   (if (< i (dec ignore-length))
-                     output
-                     (str output next-char)))))))))
-
-(comment
-
-  (rnn-test "data/obama" 75 200 false)
-  ;=>"The joke that we can start by the challenges of the American people. The American people have been talking about how to compete with the streets of San Antonio who the courage to come together as one "
-
-  (rnn-test "data/obama" 75 200 true)
-  ;=>"The joke before them prepared for five years ago, we only hear a chance to lose our efforts and they made striggling procedural deficit at the city between a politics in the efforts on the Edmund Pett"
-)
diff --git a/contrib/clojure-package/examples/rnn/src/rnn/train_char_rnn.clj b/contrib/clojure-package/examples/rnn/src/rnn/train_char_rnn.clj
deleted file mode 100644
index 41a764f7af95..000000000000
--- a/contrib/clojure-package/examples/rnn/src/rnn/train_char_rnn.clj
+++ /dev/null
@@ -1,181 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns rnn.train-char-rnn
-  (:require  [clojure.string :as string]
-             [clojure.java.shell :refer [sh]]
-             [rnn.util :as util]
-             [rnn.lstm :as lstm]
-             [rnn.test-char-rnn :as test-rnn]
-             [org.apache.clojure-mxnet.context :as context]
-             [org.apache.clojure-mxnet.callback :as callback]
-             [org.apache.clojure-mxnet.executor :as executor]
-             [org.apache.clojure-mxnet.eval-metric :as eval-metric]
-             [org.apache.clojure-mxnet.io :as mx-io]
-             [org.apache.clojure-mxnet.initializer :as init]
-             [org.apache.clojure-mxnet.ndarray :as ndarray]
-             [org.apache.clojure-mxnet.optimizer :as optimizer]
-             [org.apache.clojure-mxnet.symbol :as sym]
-             [org.apache.clojure-mxnet.module :as m])
-  (:gen-class))
-
-;;https://github.com/apache/incubator-mxnet/blob/master/example/rnn/old/char-rnn.ipynb
-
-(when-not (.exists (clojure.java.io/file "data"))
-  (do (println "Retrieving data...") (sh "./get_data.sh")))
-
-;; batch size for training
-(def batch-size 32)
-;; we can support various length input
-;; for this problem, we cut each input sentence to length of 129
-;; so we only need a fixed lenght bucket
-(def buckets [129])
-;;hidden unit in LSTM cell
-(def num-hidden 512)
-;; embedding dim which is map a char to a 256 dim vector
-(def num-embed 256)
-;; number of lstm layer
-(def num-lstm-layer 3)
-;; we will show a quick demo in 2 epoch and we will see the result
-;; by training 75 epoch
-(def num-epoch 75)
-;; learning rate
-(def learning-rate 0.01)
-;; we will use pure sgd without momentum
-(def momentum 0.0)
-
-(def ctx (context/cpu)) ;; change to gpu if desired
-(def data-path "data/obama.txt")
-(def vocab (util/build-vocab data-path))
-
-;; generate the symbol for a length
-(defn sym-gen [seq-len]
-  (lstm/lstm-unroll num-lstm-layer seq-len (inc (count vocab))
-                    num-hidden num-embed (inc (count vocab)) 0.2))
-
-;;; in the case of this fixed bucketing that only uses one bucket size - it is the equivalent of padpadding all sentences to a fixed length.
-;; we are going to use ndarray-iter for this
-;; converting the bucketing-iter over to use is todo. We could either push for the example Scala one to be included in the base package and interop with that (which would be nice for other rnn needs too) or hand convert it over ourselves
-
-
-(defn build-training-data [path]
-  (let [content (slurp path)
-        sentences (string/split content #"\n")
-        max-length (first buckets)
-        padding-int 0]
-    (doall (for [sentence sentences]
-             (let [ids (mapv #(get vocab %) sentence)]
-               (if (>= (count ids) max-length)
-                 (into [] (take max-length ids))
-                 (into ids (repeat (- max-length (count ids)) 0))))))))
-
-(defn build-labels [train-data]
-    ;; want to learn the next char some rotate by 1
-  (doall (mapv (fn [sent-data] (conj (into [] (rest sent-data)) 0))
-               train-data)))
-
-(defn data-desc->map [data-desc]
-  (->>  data-desc
-        (map vals)
-        (first)
-        (apply hash-map)))
-
-(defn train [devs]
-  (let [;; initialize the states for the lstm
-        init-c (into {} (map (fn [l]
-                               {(str "l" l "_init_c_beta") [batch-size num-hidden]})
-                             (range num-lstm-layer)))
-        init-h (into {} (map (fn [l]
-                               {(str "l" l "_init_h_beta") [batch-size num-hidden]}))
-                     (range num-lstm-layer))
-        init-states (merge init-c init-h)
-        train-data (build-training-data data-path)
-        labels (build-labels train-data)
-        sent-len (first buckets)
-        train-iter (mx-io/ndarray-iter [(ndarray/array (flatten train-data)
-                                                       [(count train-data) sent-len])]
-                                       {:label [(ndarray/array (flatten labels)
-                                                               [(count labels) sent-len])]
-                                        :label-name "softmax_label"
-                                        :data-batch-size batch-size
-                                        :last-batch-handle "pad"})
-        data-and-labels (merge (data-desc->map (mx-io/provide-data-desc train-iter))
-                               (data-desc->map (mx-io/provide-label-desc train-iter))
-                               init-states)
-        init-states-data (mapv (fn [[k v]] (ndarray/zeros v {:ctx ctx})) init-states)
-        rnn-sym (sym-gen (first buckets))
-
-        rnn-mod (-> (m/module rnn-sym {:contexts devs})
-                    (m/bind {:data-shapes (into (mx-io/provide-data-desc train-iter)
-                                                (mapv (fn [[k v]] {:name k :shape v}) init-states))
-                             :label-shapes (mx-io/provide-label-desc train-iter)})
-                    (m/init-params {:initializer (init/xavier {:factor-type "in" :magnitude 2.34})})
-                    (m/init-optimizer {:optimizer (optimizer/adam {:learning-rate learning-rate :wd 0.0001})}))
-        metric (eval-metric/custom-metric
-                (fn [label pred]
-                  (let [labels (ndarray/->vec (ndarray/transpose label))
-                        pred-shape (ndarray/shape-vec pred)
-                        size (apply * (ndarray/shape-vec label))
-                        preds (mapv #(into [] %) (doall
-                                                  (partition (last pred-shape) (ndarray/->vec pred))))
-                        results (map-indexed
-                                 (fn [i l]
-                                   (get-in preds [i (int l)]))
-                                 labels)
-                        result (->> results
-                                    (mapv #(Math/max (float 1e-10) (float %)))
-                                    (mapv #(Math/log %))
-                                    (mapv #(* -1.0 %))
-                                    (apply +))]
-                    (float (Math/exp (/ result (count labels))))))
-
-                "perplexity")]
-
-    ;; Train for 1 epochs and then show the results of 75
-    (doseq [epoch-num (range 1)]
-      (println "Doing epoch " epoch-num)
-      (mx-io/reduce-batches
-       train-iter
-       (fn [batch-num batch]
-         (let [batch (mx-io/next train-iter)]
-           (-> rnn-mod
-               (m/forward (mx-io/data-batch {:data (into (mx-io/batch-data batch) init-states-data)
-                                             :label (mx-io/batch-label batch)}))
-               (m/update-metric metric (mx-io/batch-label batch))
-               (m/backward)
-               (m/update))
-           (when (zero? (mod batch-num 10))
-             (println "Eval metric for batch-num " batch-num " is " (eval-metric/get metric)))
-           (inc batch-num))))
-      (println "Finished epoch " epoch-num)
-      #_(println "Eval-metric " (eval-metric/get-and-reset metric))
-      (m/save-checkpoint rnn-mod {:prefix "train-obama" :epoch epoch-num})
-      (println "Testing with random 200 chars ")
-      (println "=====")
-      (println  (test-rnn/rnn-test "train-obama" epoch-num 200 true))
-      (println "====="))
-
-    (println "Showing the result after 75 epochs (pre-trained)")
-    (println (test-rnn/rnn-test "data/obama" 75 200 true))
-    (println "=====")))
-
-(defn -main [& args]
-  (let [[dev dev-num] args
-        devs (if (= dev ":gpu")
-               (mapv #(context/gpu %) (range (Integer/parseInt (or dev-num "1"))))
-               (mapv #(context/cpu %) (range (Integer/parseInt (or dev-num "1")))))]
-    (train devs)))
diff --git a/contrib/clojure-package/examples/rnn/src/rnn/util.clj b/contrib/clojure-package/examples/rnn/src/rnn/util.clj
deleted file mode 100644
index bce5bb710ada..000000000000
--- a/contrib/clojure-package/examples/rnn/src/rnn/util.clj
+++ /dev/null
@@ -1,74 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns rnn.util
-  (:require [org.apache.clojure-mxnet.ndarray :as ndarray]))
-
-(defn build-vocab [path]
-  (let [content (slurp path)
-        vocab-map (reduce (fn [{:keys [vocab idx] :as result} c]
-                            (if (get vocab c)
-                              result
-                              (-> result
-                                  (update :vocab assoc c (inc idx))
-                                  (update :idx inc))))
-                          {:vocab {} :idx 0}     ;; 0 is used for padding
-                          content)]
-    (:vocab vocab-map)))
-
-(defn make-revert-vocab [vmap]
-  (into {} (map (fn [[k v]] [v k]) vmap)))
-
-(defn make-input [char vocab arr]
-  (let [idx (get vocab char)
-        tmp (ndarray/zeros [1])]
-    (do
-      (ndarray/set tmp idx)
-      (ndarray/set arr tmp))))
-
-(defn cdf [weights]
-  (let [total (* 1.0 (apply + weights))
-        csums (reduce (fn [cumsum w] (conj cumsum (+ (or (last cumsum) 0) w))) [] weights)]
-    (mapv #(/ % total) csums)))
-
-(defn choice [population weights]
-  (assert (= (count population) (count weights)))
-  (let [cdf-vals (cdf weights)
-        x (rand)
-        idx (-> (partition-by (fn [v] (>= v x)) cdf-vals)
-                first
-                count)]
-    (get population idx)))
-
-;; we can use random output of fixed-output by choosing the largest probability
-(defn make-output [prob fix-dict sample]
-  (let [temperature 1.0
-        char (if sample
-               (let [scale-prob (mapv (fn [x] (if (< x 1e-6)
-                                                1e-6
-                                                (if (> x (- 1 1e-6))
-                                                  (- 1 1e-6)
-                                                  x))) prob)
-                     rescale (mapv (fn [x] (Math/exp (/ (Math/log x) temperature))) scale-prob)
-                     sum (apply + rescale)
-                     rescale (map (fn [x] (/ x sum)) rescale)]
-                 (choice fix-dict rescale))
-               (->> (zipmap prob fix-dict)
-                    (sort-by max)
-                    (vals)
-                    last))]
-    char))
diff --git a/contrib/clojure-package/examples/rnn/test/rnn/core_test.clj b/contrib/clojure-package/examples/rnn/test/rnn/core_test.clj
deleted file mode 100644
index b198577241c3..000000000000
--- a/contrib/clojure-package/examples/rnn/test/rnn/core_test.clj
+++ /dev/null
@@ -1,26 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns rnn.core_test
- (:require 
- 	[rnn.test-char-rnn :as rnn]
- 	[clojure.test :refer :all]))
-
-(deftest check-trained-network
-	(is (= 
-		"The joke that we can start by the challenges of the American people. The American people have been talking about how to compete with the streets of San Antonio who the courage to come together as one "
-	 (rnn/rnn-test "data/obama" 75 200 false))))
\ No newline at end of file
diff --git a/contrib/clojure-package/examples/tutorial/src/tutorial/module.clj b/contrib/clojure-package/examples/tutorial/src/tutorial/module.clj
deleted file mode 100644
index 4923cdcc0684..000000000000
--- a/contrib/clojure-package/examples/tutorial/src/tutorial/module.clj
+++ /dev/null
@@ -1,286 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns tutorial.module
-  "A REPL tutorial of the MXNet Clojure API for Module, based on
-  https://mxnet.apache.org/api/clojure/module.html"
-  (:require [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [org.apache.clojure-mxnet.eval-metric :as eval-metric]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.symbol :as sym]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]))
-
-
-;; The Module API provides an intermediate and high-level interface
-;; for performing computation with neural networks in MXNet. Module
-;; wraps a Symbol and one or more Executors. It has both a high level
-;; and intermediate level API.
-
-
-;;;; Prepare the Data
-
-;; In this example, we are going to use the MNIST data set. If you
-;; start, we can run some helper scripts to download the data for us.
-
-(def data-dir "data/")
-
-(when-not (.exists (io/file (str data-dir "train-images-idx3-ubyte")))
-  (sh "../../scripts/get_mnist_data.sh"))
-
-;; MXNet provides function in the `io` namespace to load the MNIST
-;; datasets into training and test data iterators that we can use with
-;; our module.
-(def train-data (mx-io/mnist-iter {:image (str data-dir "train-images-idx3-ubyte")
-                                   :label (str data-dir "train-labels-idx1-ubyte")
-                                   :label-name "softmax_label"
-                                   :input-shape [784]
-                                   :batch-size 10
-                                   :shuffle true
-                                   :flat true
-                                   :silent false
-                                   :seed 10}))
-
-(def test-data (mx-io/mnist-iter {:image (str data-dir "t10k-images-idx3-ubyte")
-                                  :label (str data-dir "t10k-labels-idx1-ubyte")
-                                  :input-shape [784]
-                                  :batch-size 10
-                                  :flat true
-                                  :silent false}))
-
-
-;;;; Preparing a module for Computation
-
-;; To construct a module, we need to have a symbol as input. This
-;; symbol takes input data in the first layer and then has subsequent
-;; layers of fully connected and relu activation layers, ending up in
-;; a softmax layer for output.
-
-(let [data (sym/variable "data")
-      fc1 (sym/fully-connected "fc1" {:data data :num-hidden 128})
-      act1 (sym/activation "relu1" {:data fc1 :act-type "relu"})
-      fc2 (sym/fully-connected "fc2" {:data act1 :num-hidden 64})
-      act2 (sym/activation "relu2" {:data fc2 :act-type "relu"})
-      fc3 (sym/fully-connected "fc3" {:data act2 :num-hidden 10})
-      out (sym/softmax-output "softmax" {:data fc3})]
-  out) ;=>#object[org.apache.mxnet.Symbol 0x1f43a406 "org.apache.mxnet.Symbol@1f43a406"]
-
-;; You can also write this with the `as->` threading macro.
-
-
-(def out (as-> (sym/variable "data") data
-           (sym/fully-connected "fc1" {:data data :num-hidden 128})
-           (sym/activation "relu1" {:data data :act-type "relu"})
-           (sym/fully-connected "fc2" {:data data :num-hidden 64})
-           (sym/activation "relu2" {:data data :act-type "relu"})
-           (sym/fully-connected "fc3" {:data data :num-hidden 10})
-           (sym/softmax-output "softmax" {:data data})))
-;=> #'tutorial.module/out
-
-
-;; By default, context is the CPU. If you need data parallelization,
-;; you can specify a GPU context or an array of GPU contexts, like
-;; this: `(m/module out {:contexts [(context/gpu)]})`
-
-;; Before you can compute with a module, you need to call `bind` to
-;; allocate the device memory and `initParams` or `set-params` to
-;; initialize the parameters. If you simply want to fit a module, you
-;; don’t need to call `bind` and `init-params` explicitly, because the
-;; `fit` function automatically calls them if they are needed.
-
-(let [mod (m/module out)]
-  (-> mod
-      (m/bind {:data-shapes (mx-io/provide-data train-data)
-               :label-shapes (mx-io/provide-label train-data)})
-      (m/init-params)))
-
-;; Now you can compute with the module using functions like `forward`,
-;; `backward`, etc.
-
-
-;;;; Training and Predicting
-
-;; Modules provide high-level APIs for training, predicting, and
-;; evaluating. To fit a module, call the `fit` function with some data
-;; iterators:
-
-(def mod
-  (m/fit (m/module out) {:train-data train-data
-                         :eval-data test-data
-                         :num-epoch 1}))
-;; =>
-;; Epoch  0  Train- [accuracy 0.12521666]
-;; Epoch  0  Time cost- 8392
-;; Epoch  0  Validation-  [accuracy 0.2227]
-
-
-;; You can pass in batch-end callbacks using batch-end-callback and
-;; epoch-end callbacks using epoch-end-callback in the
-;; `fit-params`. You can also set parameters using functions like in
-;; the fit-params like optimizer and eval-metric. To learn more about
-;; the fit-params, see the fit-param function options. To predict with
-;; a module, call `predict` with a DataIter:
-
-(def results
-  (m/predict mod {:eval-data test-data}))
-
-(first results) ;=>#object[org.apache.mxnet.NDArray 0x3540b6d3 "org.apache.mxnet.NDArray@a48686ec"]
-
-(first (ndarray/->vec (first results))) ;=>0.08261358
-
-;; The module collects and returns all of the prediction results. For
-;; more details about the format of the return values, see the
-;; documentation for the `predict` function.
-
-;; When prediction results might be too large to fit in memory, use
-;; the `predict-every-batch` API.
-
-(let [preds (m/predict-every-batch mod {:eval-data test-data})]
-  (mx-io/reduce-batches test-data
-                        (fn [i batch]
-                          (println (str "pred is " (first (get preds i))))
-                          (println (str "label is " (mx-io/batch-label batch)))
-                          ;;; do something
-                          (inc i))))
-
-;; If you need to evaluate on a test set and don’t need the prediction
-;; output, call the `score` function with a data iterator and an eval
-;; metric:
-
-(m/score mod {:eval-data test-data
-              :eval-metric (eval-metric/accuracy)}) ;=>["accuracy" 0.2227]
-
-;; This runs predictions on each batch in the provided DataIter and
-;; computes the evaluation score using the provided EvalMetric. The
-;; evaluation results are stored in metric so that you can query
-;; later.
-
-
-
-;;;; Saving and Loading
-
-;; To save the module parameters in each training epoch, use the
-;; `save-checkpoint` function:
-
-(let [save-prefix "my-model"]
-  (doseq [epoch-num (range 3)]
-    (mx-io/do-batches train-data (fn [batch
-                                          ;; do something
-                                     ]))
-    (m/save-checkpoint mod {:prefix save-prefix
-                            :epoch epoch-num
-                            :save-opt-states true}))) 
-
-;; INFO  org.apache.mxnet.module.Module: Saved checkpoint to my-model-0000.params
-;; INFO  org.apache.mxnet.module.Module: Saved optimizer state to my-model-0000.states
-;; INFO  org.apache.mxnet.module.Module: Saved checkpoint to my-model-0001.params
-;; INFO  org.apache.mxnet.module.Module: Saved optimizer state to my-model-0001.states
-;; INFO  org.apache.mxnet.module.Module: Saved checkpoint to my-model-0002.params
-;; INFO  org.apache.mxnet.module.Module: Saved optimizer state to my-model-0002.states
-
-
-;; To load the saved module parameters, call the `load-checkpoint`
-;; function:
-
-(def new-mod (m/load-checkpoint {:prefix "my-model" :epoch 1 :load-optimizer-states true}))
-
-new-mod ;=> #object[org.apache.mxnet.module.Module 0x5304d0f4 "org.apache.mxnet.module.Module@5304d0f4"]
-
-;; To initialize parameters, bind the symbols to construct executors
-;; first with the `bind` function. Then, initialize the parameters and
-;; auxiliary states by calling the `init-params` function.\
-(-> new-mod
-    (m/bind {:data-shapes (mx-io/provide-data train-data)
-             :label-shapes (mx-io/provide-label train-data)})
-    (m/init-params))
-
-;; To get current parameters, use `params`
-(let [[arg-params aux-params] (m/params new-mod)]
-  {:arg-params arg-params
-   :aux-params aux-params})
-
-;; {:arg-params
-;;  {"fc3_bias"
-;;   #object[org.apache.mxnet.NDArray 0x39adc3b0 "org.apache.mxnet.NDArray@49caf426"],
-;;   "fc2_weight"
-;;   #object[org.apache.mxnet.NDArray 0x25baf623 "org.apache.mxnet.NDArray@a6c8f9ac"],
-;;   "fc1_bias"
-;;   #object[org.apache.mxnet.NDArray 0x6e089973 "org.apache.mxnet.NDArray@9f91d6eb"],
-;;   "fc3_weight"
-;;   #object[org.apache.mxnet.NDArray 0x756fd109 "org.apache.mxnet.NDArray@2dd0fe3c"],
-;;   "fc2_bias"
-;;   #object[org.apache.mxnet.NDArray 0x1dc69c8b "org.apache.mxnet.NDArray@d128f73d"],
-;;   "fc1_weight"
-;;   #object[org.apache.mxnet.NDArray 0x20abc769 "org.apache.mxnet.NDArray@b8e1c5e8"]},
-;;  :aux-params {}}
-
-
-;; To assign parameter and aux state values, use the `set-params`
-;; function:
-(m/set-params new-mod {:arg-params (m/arg-params new-mod)
-                       :aux-params (m/aux-params new-mod)})
-
-
-;; To resume training from a saved checkpoint, pass the loaded
-;; parameters to the `fit` function. This will prevent `fit` from
-;; initializing randomly.
-
-;; (First, reset the training data before calling `fit` or you will
-;; get an error)
-(mx-io/reset train-data)
-(mx-io/reset test-data)
-
-;; Create `fit-params` and then use it to set `begin-epoch` so that
-;; `fit` knows to resume from a saved epoch.
-
-
-(comment 
-;; FIXME
-; Caused by: java.io.EOFException
-;   at java.io.DataInputStream.readInt(DataInputStream.java:392)
-;   at java.io.ObjectInputStream$BlockDataInputStream.readInt(ObjectInputStream.java:3182)
-;   at java.io.ObjectInputStream.readInt(ObjectInputStream.java:1032)
-;   at org.apache.mxnet.Optimizer$$anon$1$$anonfun$deserializeState$1.apply$mcVI$sp(Optimizer.scala:84)
-;   at scala.collection.immutable.Range.foreach$mVc$sp(Range.scala:160)
-;   at org.apache.mxnet.Optimizer$$anon$1.deserializeState(Optimizer.scala:83)
-;   at org.apache.mxnet.module.Module$$anonfun$loadOptimizerStates$3.apply(Module.scala:594)
-;   at org.apache.mxnet.module.Module$$anonfun$loadOptimizerStates$3.apply(Module.scala:589)
-;   at scala.Option.foreach(Option.scala:257)
-;   at org.apache.mxnet.module.Module.loadOptimizerStates(Module.scala:589)
-;   at org.apache.mxnet.module.Module$$anonfun$initOptimizer$4.apply(Module.scala:407)
-;   at org.apache.mxnet.module.Module$$anonfun$initOptimizer$4.apply(Module.scala:406)
-;   at scala.Option.foreach(Option.scala:257)
-;   at org.apache.mxnet.module.Module.initOptimizer(Module.scala:406)
-;   at org.apache.mxnet.module.BaseModule.fit(BaseModule.scala:407)
-;   at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
-;   at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
-;   at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
-;   at java.lang.reflect.Method.invoke(Method.java:498)
-;   at clojure.lang.Reflector.invokeMatchingMethod(Reflector.java:93)
-;   at clojure.lang.Reflector.invokeInstanceMethod(Reflector.java:28)
-;   at org.apache.clojure_mxnet.module$fit.invokeStatic(module.clj:551)
-;   at org.apache.clojure_mxnet.module$fit.invoke(module.clj:538)
-;   at tutorial.module$eval1787.invokeStatic(module.clj:250)
-;   at tutorial.module$eval1787.invoke(module.clj:250)
-
-(m/fit new-mod {:train-data train-data
-                :eval-data test-data
-                :num-epoch 2
-                :fit-params (m/fit-params {:begin-epoch 1})})
-
-)
\ No newline at end of file
diff --git a/contrib/clojure-package/examples/tutorial/src/tutorial/symbol.clj b/contrib/clojure-package/examples/tutorial/src/tutorial/symbol.clj
index 73933b4395c5..0dc45dc095ef 100644
--- a/contrib/clojure-package/examples/tutorial/src/tutorial/symbol.clj
+++ b/contrib/clojure-package/examples/tutorial/src/tutorial/symbol.clj
@@ -30,23 +30,6 @@
 ;; graphs. You can configure the graphs either at the level of neural
 ;; network layer operations or as fine-grained operations.
 
-;; The following example configures a two-layer neural network.
-(def data (sym/variable "data"))
-(def fc1 (sym/fully-connected "fc1" {:data data :num-hidden 128}))
-(def act1 (sym/activation "act1" {:data fc1 :act-type "relu"}))
-(def fc2 (sym/fully-connected "fc2" {:data act1 :num-hidden 64}))
-(def net (sym/softmax-output "out" {:data fc2}))
-
-;; This can also be combined more dynamically with the `as->` Clojure
-;; threading form.
-(as-> (sym/variable "data") data
-  (sym/fully-connected "fc1" {:data data :num-hidden 128})
-  (sym/activation "act1"     {:data data :act-type "relu"})
-  (sym/fully-connected "fc2" {:data data :num-hidden 64})
-  (sym/softmax-output "out"  {:data data}))
-
-net ;=> #object[org.apache.mxnet.Symbol 0x5c78c8c2 "org.apache.mxnet.Symbol@5c78c8c2"] 
-
 ;; The basic arithmetic operators (plus, minus, div, multiplication)
 ;; work as expected. The following example creates a computation graph
 ;; that adds two inputs together.
@@ -74,17 +57,7 @@ net ;=> #object[org.apache.mxnet.Symbol 0x5c78c8c2 "org.apache.mxnet.Symbol@5c78
 ;;;; Group Multiple Symbols
 
 ;; To construct neural networks with multiple loss layers, we can use
-;; `group` to group multiple symbols together. The following example
-;; groups two outputs:
-
-(def net (sym/variable "data"))
-(def fc1 (sym/fully-connected {:data net :num-hidden 128}))
-(def net2 (sym/activation {:data fc1 :act-type "relu"}))
-(def out1 (sym/softmax-output {:data net2}))
-(def out2 (sym/linear-regression-output {:data net2}))
-(def group (sym/group [out1 out2]))
-(sym/list-outputs group) ;=> ["softmaxoutput0_output" "linearregressionoutput0_output"]
-
+;; `group` to group multiple symbols together.
 
 ;;;; Serialization
 
diff --git a/contrib/clojure-package/examples/tutorial/test/tutorial/core_test.clj b/contrib/clojure-package/examples/tutorial/test/tutorial/core_test.clj
deleted file mode 100644
index 0e5169c5cfaa..000000000000
--- a/contrib/clojure-package/examples/tutorial/test/tutorial/core_test.clj
+++ /dev/null
@@ -1,27 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns tutorial.core_test
- (:require [clojure.test :refer :all])
- (:require 
- 	[tutorial.introduction]
- 	[tutorial.kvstore]
- 	[tutorial.module]
- 	[tutorial.ndarray]
- 	[tutorial.symbol]))
-
-(deftest if-this-goes-here-then-tutorials-have-loaded-properly (is true))
\ No newline at end of file
diff --git a/contrib/clojure-package/examples/visualization/.gitignore b/contrib/clojure-package/examples/visualization/.gitignore
deleted file mode 100644
index c53038ec0e3d..000000000000
--- a/contrib/clojure-package/examples/visualization/.gitignore
+++ /dev/null
@@ -1,11 +0,0 @@
-/target
-/classes
-/checkouts
-pom.xml
-pom.xml.asc
-*.jar
-*.class
-/.lein-*
-/.nrepl-port
-.hgignore
-.hg/
diff --git a/contrib/clojure-package/examples/visualization/README.md b/contrib/clojure-package/examples/visualization/README.md
deleted file mode 100644
index c9eb75f6d496..000000000000
--- a/contrib/clojure-package/examples/visualization/README.md
+++ /dev/null
@@ -1,31 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# visualization
-
-
-
-## Installation
-
-Before you run this example, make sure that you have the clojure package installed.
-In the main clojure package directory, do `lein install`. Then you can run
-`lein install` in this directory.
-
-## Usage
-
-Run `lein run` to have a sample network visualization printed for you
-"testviz.pdf"
diff --git a/contrib/clojure-package/examples/visualization/project.clj b/contrib/clojure-package/examples/visualization/project.clj
deleted file mode 100644
index c19e19d6fd02..000000000000
--- a/contrib/clojure-package/examples/visualization/project.clj
+++ /dev/null
@@ -1,23 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(defproject visualization "0.1.0-SNAPSHOT"
-  :description "Visualization example"
-  :plugins [[lein-cljfmt "0.5.7"]]
-  :dependencies [[org.clojure/clojure "1.9.0"]
-                 [org.apache.mxnet.contrib.clojure/clojure-mxnet "2.0.0-SNAPSHOT"]]
-  :main visualization.core)
diff --git a/contrib/clojure-package/examples/visualization/src/visualization/core.clj b/contrib/clojure-package/examples/visualization/src/visualization/core.clj
deleted file mode 100644
index 31cce9206d81..000000000000
--- a/contrib/clojure-package/examples/visualization/src/visualization/core.clj
+++ /dev/null
@@ -1,46 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns visualization.core
-  (:require [org.apache.clojure-mxnet.symbol :as sym]
-            [org.apache.clojure-mxnet.visualization :as viz]))
-
-(defn get-symbol
-  "Builds the symbol that the example renders: the \"data\" variable passed
-  through a flatten layer and then a softmax output."
-  []
-  ;; Layers left disabled in this example. Each one consumes the previous
-  ;; symbol as :data; the conv/pool layers would sit between the input and
-  ;; the flatten layer, fc2 between flatten and softmax:
-  ;;   (sym/convolution "conv1" {:data data :kernel [3 3] :num-filter 32 :stride [2 2]})
-  ;;   (sym/batch-norm "bn1" {:data data})
-  ;;   (sym/activation "relu1" {:data data :act-type "relu"})
-  ;;   (sym/pooling "mp1" {:data data :kernel [2 2] :pool-type "max" :stride [2 2]})
-  ;;   (sym/convolution "conv2" {:data data :kernel [3 3] :num-filter 32 :stride [2 2]})
-  ;;   (sym/batch-norm "bn2" {:data data})
-  ;;   (sym/activation "relu2" {:data data :act-type "relu"})
-  ;;   (sym/pooling "mp2" {:data data :kernel [2 2] :pool-type "max" :stride [2 2]})
-  ;;   (sym/fully-connected "fc2" {:data data :num-hidden 10})
-  (let [input (sym/variable "data")
-        flattened (sym/flatten "fl" {:data input})]
-    (sym/softmax-output "softmax" {:data flattened})))
-
-(defn test-viz
-  "Plots the network from `get-symbol` for a [1 1 28 28] \"data\" input and
-  renders the resulting graph under the name \"testviz\" into \"./\"."
-  []
-  (-> (get-symbol)
-      (viz/plot-network {"data" [1 1 28 28]}
-                        {:title "foo" :node-attrs {:shape "oval" :fixedsize "false"}})
-      (viz/render "testviz" "./")))
-
-(defn -main
-  "Command-line entry point: renders the sample network, then prints where
-  to find the output. Command-line arguments are ignored."
-  [& args]
-  ;; a defn body is already an implicit `do`, so no explicit wrapper is needed
-  (test-viz)
-  (println "Check for the testviz.pdf file in the project directory"))
-
diff --git a/contrib/clojure-package/examples/visualization/test/visualization/core_test.clj b/contrib/clojure-package/examples/visualization/test/visualization/core_test.clj
deleted file mode 100644
index 1b10695cb34c..000000000000
--- a/contrib/clojure-package/examples/visualization/test/visualization/core_test.clj
+++ /dev/null
@@ -1,28 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns visualization.core_test
- (:require 
- 	[visualization.core :as visualization]
- 	[clojure.test :refer :all]))
-
-(deftest check-pdf
-  ;; Render the sample network, then check that testviz.pdf exists and was
-  ;; modified less than 10 seconds (10000 ms) ago.
-  (visualization/test-viz)
-  (let [pdf (clojure.java.io/as-file "testviz.pdf")
-        age-ms (- (System/currentTimeMillis) (.lastModified pdf))]
-    (is (.exists pdf))
-    (is (< age-ms 10000))))
-	
\ No newline at end of file
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/conv_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/conv_test.clj
deleted file mode 100644
index ca9d4bc93986..000000000000
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/conv_test.clj
+++ /dev/null
@@ -1,92 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns org.apache.clojure-mxnet.conv-test
-  (:require [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [clojure.test :refer :all]
-            [org.apache.clojure-mxnet.eval-metric :as eval-metric]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.optimizer :as optimizer]
-            [org.apache.clojure-mxnet.symbol :as sym]
-            [org.apache.clojure-mxnet.symbol-api :as sym-api]
-            [org.apache.clojure-mxnet.util :as util]
-            [clojure.reflect :as r]))
-
-;; Directory prefix for the MNIST files; every path below is built from it.
-(def data-dir "data/")
-;; Batch size handed to both MNIST iterators.
-(def batch-size 100)
-;; Number of epochs handed to `m/fit` in `test-conv`.
-(def num-epoch 1)
-
-;; Run the download script when the training images are not on disk yet.
-;; NOTE(review): the relative script path presumably expects the tests to be
-;; started from the clojure-package root -- confirm.
-(when-not (.exists (io/file (str data-dir "train-images-idx3-ubyte")))
-  (sh "./scripts/get_mnist_data.sh"))
-
-;;; Load the MNIST datasets
-;; Training iterator: shuffled with a fixed seed, images kept unflattened
-;; with data shape [1 28 28].
-(def train-data (mx-io/mnist-iter {:image (str data-dir "train-images-idx3-ubyte")
-                                   :label (str data-dir "train-labels-idx1-ubyte")
-                                   :label-name "softmax_label"
-                                   :data-shape [1 28 28]
-                                   :label-shape [1 1 10]
-                                   :batch-size batch-size
-                                   :shuffle true
-                                   :flat false
-                                   :silent false
-                                   :seed 10}))
-
-;; Evaluation iterator over the t10k files; no shuffle or seed is passed here.
-(def test-data (mx-io/mnist-iter {:image (str data-dir "t10k-images-idx3-ubyte")
-                                  :label (str data-dir "t10k-labels-idx1-ubyte")
-                                  :data-shape [1 28 28]
-                                  :batch-size batch-size
-                                  :flat false
-                                  :silent false}))
-(defn get-symbol
-  "Builds the network under test: two identical convolution -> batch-norm ->
-  relu -> max-pooling stages, then flatten, a 10-unit fully connected layer
-  and a softmax output."
-  []
-  (let [conv-stage (fn [input n]
-                     ;; layer names carry the stage number: conv1, bn1, relu1, mp1, ...
-                     (as-> input layer
-                       (sym-api/convolution {:name (str "conv" n) :data layer :kernel [3 3] :num-filter 32 :stride [2 2]})
-                       (sym-api/batch-norm {:name (str "bn" n) :data layer})
-                       (sym-api/activation {:name (str "relu" n) :data layer :act-type "relu"})
-                       (sym-api/pooling {:name (str "mp" n) :data layer :kernel [2 2] :pool-type "max" :stride [2 2]})))]
-    (as-> (sym/variable "data") net
-      (conv-stage net 1)
-      (conv-stage net 2)
-      (sym-api/flatten {:name "fl" :data net})
-      (sym-api/fully-connected {:name "fc2" :data net :num-hidden 10})
-      (sym-api/softmax-output {:name "softmax" :data net}))))
-
-(deftest test-conv
-  ;; End-to-end check: fit the conv net on the MNIST training iterator for
-  ;; `num-epoch` epochs, then require an accuracy above 0.92 on the test
-  ;; iterator. (`deftest` takes no argument vector, so none is given.)
-  (let [mod (m/module (get-symbol))]
-    ;; note: `m/fit` is the only call needed for training
-    (m/fit mod {:train-data train-data :eval-data test-data :num-epoch num-epoch
-                :fit-params (m/fit-params {:optimizer (optimizer/sgd {:learning-rate 0.1
-                                                                      :momentum 0.9
-                                                                      :wd 0.0001})})})
-
-    ;; high-level predict: the returned results are not inspected, the call
-    ;; only has to complete
-    (m/predict mod {:eval-data test-data})
-
-    ;; high-level score (returns the eval values)
-    (let [score (m/score mod {:eval-data test-data :eval-metric (eval-metric/accuracy)})]
-      (println "Score" score)
-      (is (< 0.92 (last score))))))
-
-;; REPL scratch form: the body of `comment` is never evaluated when the
-;; namespace loads. Evaluate the inner forms by hand to inspect the iterator.
-(comment
-
-  (require '[clojure.reflect :as r])
-  (r/reflect train-data))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/imageclassifier_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/imageclassifier_test.clj
deleted file mode 100644
index 5890f754033f..000000000000
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/imageclassifier_test.clj
+++ /dev/null
@@ -1,131 +0,0 @@
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns org.apache.clojure-mxnet.infer.imageclassifier-test
-  (:require [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.dtype :as dtype]
-            [org.apache.clojure-mxnet.infer :as infer]
-            [org.apache.clojure-mxnet.layout :as layout]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [clojure.test :refer :all]
-            [test-helper]))
-
-;; NOTE(review): presumably makes the images under test/test-images available
-;; to the tests below -- confirm in test-helper.
-(test-helper/load-test-images)
-
-;; Directory prefix for downloaded models.
-(def model-dir "data/")
-;; Path prefix of the ResNet-18 model files; "-symbol.json" is appended below.
-(def model-path-prefix (str model-dir "resnet-18/resnet-18"))
-
-;; Run the download script when the model's symbol file is not on disk yet.
-(when-not (.exists (io/file (str model-path-prefix "-symbol.json")))
-  (sh "./scripts/infer/get_resnet_18_data.sh"))
-
-(defn create-classifier
-  "Creates an image classifier for the model at `model-path-prefix`, with a
-  single FLOAT32 \"data\" input of shape [1 3 224 224] in NCHW layout."
-  []
-  (-> model-path-prefix
-      (infer/model-factory [{:name "data"
-                             :shape [1 3 224 224]
-                             :layout layout/NCHW
-                             :dtype dtype/FLOAT32}])
-      (infer/create-image-classifier)))
-
-(deftest test-single-classification
-  ;; Classify one image with no top-k argument, with top-k = 10 and with
-  ;; top-k = 5 plus an explicit dtype; check result sizes and the best class.
-  (let [classifier (create-classifier)
-        kitten (infer/load-image-from-file "test/test-images/kitten.jpg")
-        [all-classes] (infer/classify-image classifier kitten)
-        [top-10] (infer/classify-image classifier kitten 10)
-        [top-5] (infer/classify-image classifier kitten 5 dtype/FLOAT32)
-        best (first top-5)]
-    (is (= 1000 (count all-classes)))
-    (is (= 10 (count top-10)))
-    (is (= 5 (count top-5)))
-    (is (= "n02123159 tiger cat" (:class best)))
-    (is (< 0 (:prob best) 1))))
-
-(deftest test-batch-classification
-  ;; Classify a batch of two images; the assertions only look at the first
-  ;; element of each batch result.
-  (let [classifier (create-classifier)
-        images (infer/load-image-paths ["test/test-images/kitten.jpg"
-                                        "test/test-images/Pug-Cookie.jpg"])
-        [all-classes] (infer/classify-image-batch classifier images)
-        [top-10] (infer/classify-image-batch classifier images 10)
-        [top-5] (infer/classify-image-batch classifier images 5 dtype/FLOAT32)
-        best (first top-5)]
-    (is (= 1000 (count all-classes)))
-    (is (= 10 (count top-10)))
-    (is (= 5 (count top-5)))
-    (is (= "n02123159 tiger cat" (:class best)))
-    (is (< 0 (:prob best) 1))))
-
-(deftest test-single-classification-with-ndarray
-  ;; Same checks as the image-based test, but the input is converted to an
-  ;; NDArray by hand and passed to `classify-with-ndarray`.
-  (let [classifier (create-classifier)
-        kitten-nd (-> "test/test-images/kitten.jpg"
-                      infer/load-image-from-file
-                      (infer/reshape-image 224 224)
-                      (infer/buffered-image-to-pixels [3 224 224] dtype/FLOAT32)
-                      (ndarray/expand-dims 0))
-        [all-classes] (infer/classify-with-ndarray classifier [kitten-nd])
-        [top-5] (infer/classify-with-ndarray classifier [kitten-nd] 5)
-        best (first top-5)]
-    (is (= 1000 (count all-classes)))
-    (is (= 5 (count top-5)))
-    (is (= "n02123159 tiger cat" (:class best)))
-    (is (< 0 (:prob best) 1))))
-
-(deftest test-single-classify
-  ;; Feeds the image to `infer/classify` as a plain Clojure vector of pixel
-  ;; values; the results are used directly, without destructuring.
-  (let [classifier (create-classifier)
-        kitten-nd (-> "test/test-images/kitten.jpg"
-                      infer/load-image-from-file
-                      (infer/reshape-image 224 224)
-                      (infer/buffered-image-to-pixels [3 224 224] dtype/FLOAT32)
-                      (ndarray/expand-dims 0))
-        all-classes (infer/classify classifier [(ndarray/->vec kitten-nd)])
-        top-5 (infer/classify classifier [(ndarray/->vec kitten-nd)] 5)
-        best (first top-5)]
-    (is (= 1000 (count all-classes)))
-    (is (= 5 (count top-5)))
-    (is (= "n02123159 tiger cat" (:class best)))
-    (is (< 0 (:prob best) 1))))
-
-(deftest test-base-classification-with-ndarray
-  ;; Same NDArray checks, but against the classifier built with
-  ;; `infer/create-classifier` instead of `infer/create-image-classifier`.
-  (let [input-desc {:name "data"
-                    :shape [1 3 224 224]
-                    :layout layout/NCHW
-                    :dtype dtype/FLOAT32}
-        classifier (-> model-path-prefix
-                       (infer/model-factory [input-desc])
-                       (infer/create-classifier))
-        kitten-nd (-> "test/test-images/kitten.jpg"
-                      infer/load-image-from-file
-                      (infer/reshape-image 224 224)
-                      (infer/buffered-image-to-pixels [3 224 224] dtype/FLOAT32)
-                      (ndarray/expand-dims 0))
-        [all-classes] (infer/classify-with-ndarray classifier [kitten-nd])
-        [top-5] (infer/classify-with-ndarray classifier [kitten-nd] 5)
-        best (first top-5)]
-    (is (= 1000 (count all-classes)))
-    (is (= 5 (count top-5)))
-    (is (= "n02123159 tiger cat" (:class best)))
-    (is (< 0 (:prob best) 1))))
-
-(deftest test-base-single-classify
-  ;; Plain-vector input against the classifier built with
-  ;; `infer/create-classifier`.
-  (let [input-desc {:name "data"
-                    :shape [1 3 224 224]
-                    :layout layout/NCHW
-                    :dtype dtype/FLOAT32}
-        classifier (-> model-path-prefix
-                       (infer/model-factory [input-desc])
-                       (infer/create-classifier))
-        kitten-nd (-> "test/test-images/kitten.jpg"
-                      infer/load-image-from-file
-                      (infer/reshape-image 224 224)
-                      (infer/buffered-image-to-pixels [3 224 224] dtype/FLOAT32)
-                      (ndarray/expand-dims 0))
-        all-classes (infer/classify classifier [(ndarray/->vec kitten-nd)])
-        top-5 (infer/classify classifier [(ndarray/->vec kitten-nd)] 5)
-        best (first top-5)]
-    (is (= 1000 (count all-classes)))
-    (is (= 5 (count top-5)))
-    (is (= "n02123159 tiger cat" (:class best)))
-    (is (< 0 (:prob best) 1))))
-
-
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/predictor_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/predictor_test.clj
deleted file mode 100644
index e1526be61fbf..000000000000
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/infer/predictor_test.clj
+++ /dev/null
@@ -1,77 +0,0 @@
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns org.apache.clojure-mxnet.infer.predictor-test
-  (:require [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.dtype :as dtype]
-            [org.apache.clojure-mxnet.infer :as infer]
-            [org.apache.clojure-mxnet.layout :as layout]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.shape :as shape]
-            [clojure.java.io :as io]
-            [clojure.java.shell :refer [sh]]
-            [clojure.string :refer [split]]
-            [clojure.test :refer :all]
-            [org.apache.clojure-mxnet.util :as util]))
-
-;; Directory prefix for downloaded models.
-(def model-dir "data/")
-;; Path prefix of the ResNet-18 model files; "-symbol.json" is appended below.
-(def model-path-prefix (str model-dir "resnet-18/resnet-18"))
-;; Image dimensions used for the input descriptor and for resizing.
-(def width 224)
-(def height 224)
-
-;; Run the download script when the model's symbol file is not on disk yet.
-(when-not (.exists (io/file (str model-path-prefix "-symbol.json")))
-  (sh "./scripts/infer/get_resnet_18_data.sh"))
-
-(defn create-predictor
-  "Creates a predictor for the model at `model-path-prefix`, with a single
-  FLOAT32 \"data\" input of shape [1 3 height width] in NCHW layout."
-  []
-  (-> model-path-prefix
-      (infer/model-factory [{:name "data"
-                             :shape [1 3 height width]
-                             :layout layout/NCHW
-                             :dtype dtype/FLOAT32}])
-      (infer/create-predictor)))
-
-(deftest predictor-test-with-ndarray
-  ;; Runs the predictor on an NDArray input and checks that the arg-max class,
-  ;; looked up in synset.txt next to the model files, is the expected label.
-  (let [predictor (create-predictor)
-        image-ndarray (-> "test/test-images/kitten.jpg"
-                          infer/load-image-from-file
-                          (infer/reshape-image width height)
-                          ;; pixel shape follows the descriptor's
-                          ;; [1 3 height width] axis order; width and height
-                          ;; are equal here, so the previous
-                          ;; [3 width height] happened to give the same shape
-                          (infer/buffered-image-to-pixels [3 height width])
-                          (ndarray/expand-dims 0))
-        predictions (infer/predict-with-ndarray predictor [image-ndarray])
-        synset-file (-> (io/file model-path-prefix)
-                        (.getParent)
-                        (io/file "synset.txt"))
-        ;; one label per line; the vector is indexed by class id below
-        synset-names (split (slurp synset-file) #"\n")
-        [best-index] (ndarray/->int-vec (ndarray/argmax (first predictions) 1))
-        best-prediction (synset-names best-index)]
-    (is (= "n02123159 tiger cat" best-prediction))))
-
-(deftest predictor-test
-  ;; Same check as the NDArray variant, but the input goes in as a plain
-  ;; vector of pixel values and the first prediction is wrapped back into a
-  ;; [1 1000] NDArray before taking the arg-max.
-  (let [predictor (create-predictor)
-        image-ndarray (-> "test/test-images/kitten.jpg"
-                          infer/load-image-from-file
-                          (infer/reshape-image width height)
-                          ;; pixel shape follows the descriptor's
-                          ;; [1 3 height width] axis order; width and height
-                          ;; are equal here, so the previous
-                          ;; [3 width height] happened to give the same shape
-                          (infer/buffered-image-to-pixels [3 height width])
-                          (ndarray/expand-dims 0))
-        predictions (infer/predict predictor [(ndarray/->vec image-ndarray)])
-        synset-file (-> (io/file model-path-prefix)
-                        (.getParent)
-                        (io/file "synset.txt"))
-        ;; one label per line; the vector is indexed by class id below
-        synset-names (split (slurp synset-file) #"\n")
-        ndarray-preds (ndarray/array (first predictions) [1 1000])
-        [best-index] (ndarray/->int-vec (ndarray/argmax ndarray-preds 1))
-        best-prediction (synset-names best-index)]
-    (is (= "n02123159 tiger cat" best-prediction))))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj
deleted file mode 100644
index e03c43848332..000000000000
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/module_test.clj
+++ /dev/null
@@ -1,355 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns org.apache.clojure-mxnet.module-test
-  (:require [clojure.java.io :as io]
-            [org.apache.clojure-mxnet.context :as context]
-            [org.apache.clojure-mxnet.dtype :as dtype]
-            [org.apache.clojure-mxnet.io :as mx-io]
-            [org.apache.clojure-mxnet.layout :as layout]
-            [org.apache.clojure-mxnet.module :as m]
-            [org.apache.clojure-mxnet.monitor :as monitor]
-            [org.apache.clojure-mxnet.ndarray :as ndarray]
-            [org.apache.clojure-mxnet.optimizer :as optimizer]
-            [org.apache.clojure-mxnet.shape :as mx-shape]
-            [org.apache.clojure-mxnet.symbol :as sym]
-            [org.apache.clojure-mxnet.util :as util]
-            [clojure.spec.alpha :as s]
-            [clojure.test :refer :all]
-            [clojure.reflect :as r]
-            [clojure.string :as string]))
-
-;; Binds a relu-activation module on two CPU contexts with FLOAT32 data, runs
-;; one forward and one backward pass, and checks that every output reports
-;; dtype FLOAT32.
-(deftest test-model-dtype
-  (let [dtype dtype/FLOAT32
-        dshape [3 8 7]
-        s (sym/variable "data")
-        s (sym/activation "act" {"__layout__" "TNC"} {:data s :act_type "relu"})
-
-        mod (m/module s ["data"] nil [(context/cpu 0) (context/cpu 1)])]
-    ;; the steps below are order-dependent: bind -> init -> forward -> backward
-    (-> mod
-        (m/bind {:data-shapes [{:name "data" :shape dshape :dtype dtype :layout "TNC"}]})
-        (m/init-params)
-        (m/forward {:data [(ndarray/ones dshape {:dtype dtype})]})
-        (m/backward [(ndarray/ones dshape {:dtype dtype})]))
-    ;; `m/outputs` is nested, so it is flattened before checking each array
-    (let [outputs  (-> mod (m/outputs) flatten)]
-      (is (every? #(= dtype/FLOAT32 (ndarray/dtype %)) outputs)))))
-
-;; Builds the expression a + (2*b + 3*c), binds it with input gradients
-;; enabled, feeds all-ones inputs and an all-ones head gradient, and checks
-;; that the merged input gradients are 1, 2 and 3.
-(deftest test-module-input-grads
-  (let [a (sym/variable "a" {:kwargs {"__layout__" "NC"}})
-        b (sym/variable "b" {:kwargs {"__layout__" "NC"}})
-        c (sym/variable "c" {:kwargs {"__layout__" "NC"}})
-        ;; `c` is rebound here: from now on it names the whole expression
-        c (sym/+ a (sym/+ (sym/* b 2) (sym/* c 3)))
-        mod (m/module c ["b" "c" "a"] nil [(context/cpu 0) (context/cpu 1)])]
-    (-> mod
-        (m/bind {:data-shapes [{:name "b" :shape [5 5] :layout layout/NT}
-                               {:name "c" :shape [5 5] :layout layout/NT}
-                               {:name "a" :shape [5 5] :layout layout/NT}]
-                 :inputs-need-grad true})
-        (m/init-params)
-        (m/forward {:data [(ndarray/ones [5 5])
-                           (ndarray/ones [5 5])
-                           (ndarray/ones [5 5])]
-                    :label nil
-                    :index nil
-                    :pad 0})
-        (m/backward [(ndarray/ones [5 5])]))
-    ;; NOTE(review): the data names are listed as b, c, a, yet the gradients
-    ;; are destructured as a, b, c -- confirm the ordering contract of
-    ;; `m/input-grads-merged`.
-    (let [[a-grad b-grad c-grad] (m/input-grads-merged mod)]
-      (is (every? #(= 1.0 %) (ndarray/->vec a-grad)))
-      (is (every? #(= 2.0 %) (ndarray/->vec b-grad)))
-      (is (every? #(= 3.0 %) (ndarray/->vec c-grad))))))
-
-(deftest test-module-layout
-  ;; Binds a relu module with "TNC" layout on two CPU contexts and checks the
-  ;; output shapes: the merged output has the full input shape, while each
-  ;; individual output has shape [3 4 7].
-  (let [s (sym/variable "data")
-        ;; symbol is named "act", without a trailing space, matching
-        ;; test-model-dtype
-        s (sym/activation "act" {"__layout__" "TNC"} {:data s :act_type "relu"})
-        dshape [3 8 7]
-        mod (m/module s ["data"] nil [(context/cpu 0) (context/cpu 1)])]
-    (-> mod
-        (m/bind {:data-shapes [{:name "data" :shape dshape :dtype dtype/FLOAT32 :layout "TNC"}]})
-        (m/init-params)
-        (m/forward {:data [(ndarray/ones dshape)]
-                    :label nil
-                    :index nil
-                    :pad 0})
-        (m/backward [(ndarray/ones dshape)]))
-    (let [outputs-merged (m/outputs-merged mod)
-          outputs (m/outputs mod)
-          ;; NOTE(review): presumably the size-8 axis (N in "TNC") is split
-          ;; evenly across the two contexts -- confirm.
-          hd-shape [3 4 7]]
-      (is (= dshape (-> outputs-merged first (ndarray/shape) (ndarray/->vec))))
-      (is (every? #(= hd-shape (-> % ndarray/shape ndarray/->vec)) (flatten outputs))))))
-
-;; Saves a checkpoint (with optimizer states) from a single-device module and
-;; reloads it through both arities of `m/load-checkpoint`, checking that the
-;; symbol JSON and the first element of `m/params` survive the round trip.
-;; NOTE(review): the checkpoint is written under the prefix "test" and is not
-;; removed afterwards; the multi-device test uses the same prefix.
-(deftest test-module-save-load-single-device
-  (let [s (sym/variable "data")
-        s (sym/fully-connected {:data s :num-hidden 100})
-        ;; single device
-        mod (m/module s {:data-names ["data"] :label-names nil})]
-    (-> mod
-        (m/bind {:data-shapes [{:name "data" :shape [10 10] :layout "NT"}]})
-        (m/init-params)
-        (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1 :momentum 0.9})})
-        (m/update)
-        (m/save-checkpoint {:prefix "test" :epoch 0 :save-opt-states true}))
-    ;; map arity, asking for the optimizer states to be loaded as well
-    (let [mod2 (m/load-checkpoint {:prefix "test" :epoch 0 :load-optimizer-states true})]
-      (-> mod2
-          (m/bind {:data-shapes [{:name "data" :shape [10 10] :layout "NT"}]})
-          (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1 :momentum 0.9})}))
-      (is (= (-> mod m/symbol sym/to-json) (-> mod2 m/symbol sym/to-json)))
-      (is (= (-> mod m/params first) (-> mod2 m/params first))))
-    ;; arity 2 version of above. `load-optimizer-states` is `false` here by default,
-    ;; but optimizers states aren't checked here so it's not relevant to the test outcome.
-    (let [mod3 (m/load-checkpoint "test" 0)]
-      (-> mod3
-          (m/bind {:data-shapes [{:name "data" :shape [10 10] :layout "NT"}]})
-          (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1 :momentum 0.9})}))
-      (is (= (-> mod m/symbol sym/to-json) (-> mod3 m/symbol sym/to-json)))
-      (is (= (-> mod m/params first) (-> mod3 m/params first))))))
-
-;; Same save/load round trip as the single-device test, but the module is
-;; created on two CPU contexts; only the map arity of `m/load-checkpoint`
-;; is exercised.
-;; NOTE(review): the checkpoint is written under the prefix "test" and is not
-;; removed afterwards.
-(deftest test-module-save-load-multi-device
-  (let [s (sym/variable "data")
-        s (sym/fully-connected {:data s :num-hidden 100})
-        ;; multi device
-        mod (m/module s {:data-names ["data"] :label-names nil
-                         :contexts [(context/cpu 0) (context/cpu 1)]})]
-    (-> mod
-        (m/bind {:data-shapes [{:name "data" :shape [10 10] :layout "NT"}]})
-        (m/init-params)
-        (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1 :momentum 0.9})})
-        (m/update)
-        (m/save-checkpoint {:prefix "test" :epoch 0 :save-opt-states true}))
-
-    ;; reload and compare the symbol JSON and the first element of `m/params`
-    (let [mod2 (m/load-checkpoint {:prefix "test" :epoch 0 :load-optimizer-states true})]
-      (-> mod2
-          (m/bind {:data-shapes [{:name "data" :shape [10 10] :layout "NT"}]})
-          (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1 :momentum 0.9})}))
-      (is (= (-> mod m/symbol sym/to-json)  (-> mod2 m/symbol sym/to-json)))
-      (is (= (-> mod m/params first) (-> mod2 m/params first))))))
-
-(deftest test-module-reshape
-  ;; Trains one SGD step (learning rate 1.0) on a [7 20] batch, reshapes the
-  ;; module to a [14 20] batch and trains one more step, checking the merged
-  ;; output shape and the "fc_bias" values after each step.
-  (let [s (sym/variable "data")
-        s (sym/fully-connected "fc" {:data s :num-hidden 20})
-        dshape [7 20]
-        mod (m/module s ["data"] nil [(context/cpu 0) (context/cpu 1)])]
-    (-> mod
-        (m/bind {:data-shapes [{:name "data" :shape dshape :layout "NT"}]})
-        (m/init-params)
-        (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 1.0})})
-        (m/forward {:data [(ndarray/ones dshape)] :label nil :index nil :pad 0})
-        (m/backward [(ndarray/ones dshape)])
-        (m/update))
-    (is (= dshape (-> (m/outputs-merged mod) first ndarray/shape mx-shape/->vec)))
-    (is (every? #(= -1.0 %) (-> (m/params mod) (first) (get "fc_bias") (ndarray/->vec))))
-
-    (let [dshape [14 20]]
-      (-> mod
-          (m/reshape [{:name "data" :shape dshape :layout "NT"}])
-          (m/forward {:data [(ndarray/ones dshape)] :label nil :index nil :pad 0})
-          (m/backward [(ndarray/ones dshape)])
-          (m/update))
-      (is (= dshape (-> (m/outputs-merged mod) first ndarray/shape mx-shape/->vec)))
-      ;; Every bias must be within 1e-3 of -3. A one-sided check such as
-      ;; `(< 1e-3 (- 3 %))` accepts any value below 2.999 and so could not
-      ;; catch a wrong update.
-      ;; NOTE(review): -3 presumably results from the doubled batch applying
-      ;; twice the first step's update (-1, then -2 more) -- confirm.
-      (is (every? #(< (Math/abs (double (+ 3.0 %))) 1e-3)
-                  (-> mod m/params first (get "fc_bias") (ndarray/->vec)))))))
-
-(deftest test-set-params
-  ;; Smoke test for `m/set-params`: a complete parameter map, one with a
-  ;; missing entry (with :allow-missing) and one with an extra entry (with
-  ;; :allow-extra). There are no explicit assertions; the test passes when
-  ;; none of the three calls throws.
-  (let [data (ndarray/array [0.05 0.1] [1 1 1 2])
-        label (ndarray/array [0.01 0.99] [1 1 1 2])
-        train-data (mx-io/ndarray-iter [data] {:label [label] :label-name "softmax_label"})
-        x (as-> (sym/variable "data") v
-            (sym/fully-connected "fc_0" {:data v :num-hidden 2})
-            (sym/activation "act_0" {:data v :act-type "sigmoid"})
-            (sym/fully-connected "fc_1" {:data v :num-hidden 2})
-            (sym/activation "act_1" {:data v :act-type "sigmoid"})
-            (sym/linear-regression-output "softmax" {:data v :grad-scale 2}))
-
-        mod (m/module x)]
-    (m/bind mod {:data-shapes (mx-io/provide-data-desc train-data) :label-shapes (mx-io/provide-label train-data)})
-
-    ;; The third fc_1 weight is 0.5. It must not be written as `05`, which
-    ;; Clojure reads as the octal integer 5.
-    (let [arg-params-correct {"fc_0_weight" (ndarray/array [0.15 0.2 0.25 0.3] [2 2])
-                              "fc_0_bias" (ndarray/array [0.35 0.35] [2])
-                              "fc_1_weight" (ndarray/array [0.4 0.45 0.5 0.55] [2 2])
-                              "fc_1_bias" (ndarray/array [0.6 0.6] [2])}
-          ;; same as above without "fc_1_bias"
-          arg-params-missing {"fc_0_weight" (ndarray/array [0.15 0.2 0.25 0.3] [2 2])
-                              "fc_0_bias" (ndarray/array [0.35 0.35] [2])
-                              "fc_1_weight" (ndarray/array [0.4 0.45 0.5 0.55] [2 2])}
-          ;; same as the correct map plus an "fc_2_weight" entry; the network
-          ;; above defines no layer named fc_2
-          arg-params-extra {"fc_0_weight" (ndarray/array [0.15 0.2 0.25 0.3] [2 2])
-                            "fc_0_bias" (ndarray/array [0.35 0.35] [2])
-                            "fc_1_weight" (ndarray/array [0.4 0.45 0.5 0.55] [2 2])
-                            "fc_1_bias" (ndarray/array [0.6 0.6] [2])
-                            "fc_2_weight" (ndarray/array [0.6 0.6] [2])}]
-      (m/set-params mod {:arg-params arg-params-correct :force-init true})
-      (m/set-params mod {:arg-params arg-params-missing :allow-missing true})
-      (m/set-params mod {:arg-params arg-params-extra :allow-extra true}))))
-
-(deftest test-monitor
-  ;; Installs a monitor on a small two-layer network, runs one
-  ;; forward/backward pass and checks how many entries the monitor collected
-  ;; per layer-name prefix.
-  (let [data (ndarray/array [0.05 0.1] [1 1 1 2])
-        label (ndarray/array [0.01 0.99] [1 1 1 2])
-        train-data (mx-io/ndarray-iter [data] {:label [label] :label-name "softmax_label"})
-        x (as-> (sym/variable "data") v
-            (sym/fully-connected "fc_0" {:data v :num-hidden 2})
-            (sym/activation "act_0" {:data v :act-type "sigmoid"})
-            (sym/fully-connected "fc_1" {:data v :num-hidden 2})
-            (sym/activation "act_1" {:data v :act-type "sigmoid"})
-            (sym/linear-regression-output "softmax" {:data v :grad-scale 2}))
-        ;; create monitor; the statistic is sum(|x|) divided by the product
-        ;; of x's shape
-        mon (monitor/monitor 1 (fn [x]
-                                 (ndarray/div (ndarray/sum (ndarray/abs x))
-                                              (mx-shape/product (ndarray/shape x)))))
-        mod (m/module x {:contexts [(context/cpu 0)]})
-        ;; The third fc_1 weight is 0.5. It must not be written as `05`,
-        ;; which Clojure reads as the octal integer 5.
-        arg-params {"fc_0_weight" (ndarray/array [0.15 0.2 0.25 0.3] [2 2])
-                    "fc_0_bias" (ndarray/array [0.35 0.35] [2])
-                    "fc_1_weight" (ndarray/array [0.4 0.45 0.5 0.55] [2 2])
-                    "fc_1_bias" (ndarray/array [0.6 0.6] [2])}
-        data-batch (mx-io/next train-data)]
-    (-> mod
-        (m/bind {:data-shapes [{:name "data", :shape [1 1 1 2]}]
-                 :label-shapes [{:name "softmax_label", :shape [1 1 1 2]}]})
-        (m/install-monitor mon)
-        (m/init-params {:arg-params arg-params}))
-    (monitor/tic mon)
-    (m/forward-backward mod data-batch)
-    (let [result (monitor/toc mon)
-          ;; reduce the second element of each entry to its first two
-          ;; "_"-separated parts (e.g. "fc_0") and count the occurrences
-          freq (->> result
-                    (map (fn [v] (as-> (second v) ?
-                                   (clojure.string/split ? #"_")
-                                   (take 2 ?)
-                                   (clojure.string/join "_" ?))))
-                    (frequencies))
-          expected-freq {"act_0" 2 "act_1" 2 "data" 1 "fc_0" 6 "fc_1" 6}]
-      ;; only the expected keys are compared; other prefixes are ignored
-      (is (= expected-freq (select-keys freq (keys expected-freq)))))))
-
-(deftest test-forward-reshape
-  (let [num-class 10
-        data1 (sym/variable "data1")
-        data2 (sym/variable "data2")
-        conv1 (sym/convolution {:data data1 :kernel [2 2] :num-filter 2 :stride [2 2]})
-        conv2 (sym/convolution {:data data2 :kernel [3 3] :num-filter 3 :stride [1 1]})
-        pooling1 (sym/pooling {:data conv1 :kernel [2 2] :pool-type "avg" :stride [1 1]})
-        pooling2 (sym/pooling {:data conv2 :kernel [2 2] :pool-type "max" :stride [1 1]})
-        flatten1 (sym/flatten {:data pooling1})
-        flatten2 (sym/flatten {:data pooling2})
-        sum (sym/+ (sym/sum {:data flatten1 :axis 1})
-                   (sym/sum {:data flatten2 :axis 1}))
-        fc (sym/fully-connected {:data sum :num-hidden num-class})
-        my-sym (sym/softmax-output "softmax" {:data fc})
-
-        d-shape1 [10 3 64 64]
-        d-shape2 [10 3 32 32]
-        l-shape [10]
-
-        mod (m/module my-sym {:data-names ["data1" "data2"]})
-        data-batch {:data [(ndarray/random-uniform 0 9 (str (mx-shape/->shape d-shape1)))
-                           (ndarray/random-uniform 5 15 (str (mx-shape/->shape d-shape2)))]
-                    :label [(ndarray/ones l-shape)]
-                    :index nil
-                    :pad 0}]
-
-   ;; train with the original shapes
-    (-> mod
-        (m/bind {:data-shapes [{:name "data1" :shape d-shape1}
-                               {:name "data2" :shape d-shape2}]
-                 :label-shapes [{:name "softmax_label" :shape l-shape :layout "N"}]})
-        (m/init-params)
-        (m/init-optimizer {:optimizer (optimizer/sgd {:learning-rate 0.1})})
-        (m/forward data-batch))
-    (is (= [(first l-shape) num-class]
-           (-> mod
-               (m/outputs-merged)
-               (first)
-               (ndarray/shape)
-               (mx-shape/->vec))))
-    (-> mod
-        (m/backward)
-        (m/update))
-
-    (let [d-shape1 [3 3 64 64]
-          d-shape2 [3 3 32 32]
-          l-shape [3]
-          data-batch-2 {:data [(ndarray/random-uniform 0 9 (str (mx-shape/->shape d-shape1)))
-                               (ndarray/random-uniform 5 15 (str (mx-shape/->shape d-shape2)))]
-                        :label [(ndarray/ones l-shape)]
-                        :index nil
-                        :pad 0}]
-      (-> mod
-          (m/forward data-batch-2))
-      (is (= [(first l-shape) num-class]
-             (-> mod
-                 (m/outputs-merged)
-                 (first)
-                 (ndarray/shape)
-                 (mx-shape/->vec))))
-      (-> mod
-          (m/backward)
-          (m/update)))
-
-    (let [d-shape1 [20 3 64 64]
-          d-shape2 [20 3 32 32]
-          l-shape [20]
-          data-batch-2 {:data [(ndarray/random-uniform 3 5 (str (mx-shape/->shape d-shape1)))
-                               (ndarray/random-uniform 10 25 (str (mx-shape/->shape d-shape2)))]
-                        :label [(ndarray/ones l-shape)]
-                        :index nil
-                        :pad 0}]
-      (-> mod
-          (m/forward data-batch-2))
-      (is (= [(first l-shape) num-class]
-             (-> mod
-                 (m/outputs-merged)
-                 (first)
-                 (ndarray/shape)
-                 (mx-shape/->vec))))
-      (-> mod
-          (m/backward)
-          (m/update)))
-
-    ;; train with both different batch sizes and data shapes
-    (let [d-shape1 [20 3 120 120]
-          d-shape2 [20 3 32 64]
-          l-shape [20]
-          data-batch {:data [(ndarray/random-uniform 0 9 (str (mx-shape/->shape d-shape1)))
-                             (ndarray/random-uniform 15 25 (str (mx-shape/->shape d-shape2)))]
-                      :label [(ndarray/ones l-shape)]
-                      :index nil
-                      :pad 0}]
-      (-> mod
-          (m/forward data-batch))
-      (is (= [(first l-shape) num-class]
-             (-> (m/outputs-merged mod)
-                 first
-                 (ndarray/shape)
-                 (mx-shape/->vec))))
-      (-> mod
-          (m/backward)
-          (m/update)))
-    (let [d-shape1 [5 3 28 40]
-          d-shape2 [5 3 24 16]
-          l-shape [5]
-          data-batch {:data [(ndarray/random-uniform 0 9 (str (mx-shape/->shape d-shape1)))
-                             (ndarray/random-uniform 15 25 (str (mx-shape/->shape d-shape2)))]
-                      :label [(ndarray/ones l-shape)]
-                      :index nil
-                      :pad 0}]
-      (-> mod
-          (m/forward data-batch))
-      (is (= [(first l-shape) num-class]
-             (-> (m/outputs-merged mod)
-                 first
-                 (ndarray/shape)
-                 (mx-shape/->vec))))
-      (-> mod
-          (m/backward)
-          (m/update)))))
-
-(comment
-
-  (m/data-shapes x))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_api_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_api_test.clj
deleted file mode 100644
index 18b8b78f19d1..000000000000
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/ndarray_api_test.clj
+++ /dev/null
@@ -1,415 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns org.apache.clojure-mxnet.ndarray-api-test
-  (:require [org.apache.clojure-mxnet.base :as base]
-            [org.apache.clojure-mxnet.context :as ctx]
-            [org.apache.clojure-mxnet.dtype :as dtype]
-            [org.apache.clojure-mxnet.ndarray :as ndarray :refer [->vec zeros ones += -= *= full shape shape-vec]]
-            [org.apache.clojure-mxnet.ndarray-api :as ndarray-api]
-            [org.apache.clojure-mxnet.shape :as mx-shape :refer [->shape]]
-            [org.apache.clojure-mxnet.test-util :as test-util :refer [approx=]]
-            [org.apache.clojure-mxnet.util :as util :refer [->option]]
-            [clojure.test :refer :all]))
-
-(deftest test-activation
-  (let [data (ndarray/array [2 1 0 -1 -2] [1 5])
-        relu (ndarray-api/activation data "relu")
-        sigmoid (ndarray-api/activation data "sigmoid")
-        softsign (ndarray-api/activation data "softsign")
-        out (ndarray/zeros [1 5])
-        _ (ndarray-api/activation {:data data :act-type "relu" :out out})]
-    (is (= [2.0 1.0 0.0 0.0 0.0] (->vec relu)))
-    (is (approx= 1e-3 [0.881 0.731 0.5 0.269 0.119] (->vec sigmoid)))
-    (is (approx= 1e-3 [0.666 0.5 0.0 -0.5 -0.666] (->vec softsign)))
-    (is (= [2.0 1.0 0.0 0.0 0.0] (->vec out)))))
-
-(deftest test-bilinear-sampler
-  (let [data (ndarray/array [1 4 3 6
-                             1 8 8 9
-                             0 4 1 5
-                             1 0 1 3]
-                            [1 1 4 4])
-        affine (ndarray/array [2 0 0
-                               0 2 0]
-                              [1 6])
-        grid (ndarray-api/grid-generator {:data affine :transform-type "affine" :target-shape [4 4]})
-        out (ndarray-api/bilinear-sampler data grid)]
-    (is (approx= 1e-3
-                 [0.0 0.0 0.0 0.0
-                  0.0 3.5 6.5 0.0
-                  0.0 1.25 2.5 0.0
-                  0.0 0.0 0.0 0.0]
-                 (->vec out)))))
-
-(deftest test-cast
-  (let [nda1 (ndarray/array [0.9 1.3] [2])
-        nda2 (ndarray/array [1e20 11.1] [2])
-        nda3 (ndarray/array [300 11.1 10.9 -1 -3] [5])
-        out (ndarray/zeros [2] {:dtype dtype/INT32})
-        _ (ndarray-api/cast {:data nda1 :dtype (str dtype/INT32) :out out})]
-    (is (= [0.0 1.0] (->vec (ndarray-api/cast nda1 (str dtype/INT32)))))
-    (is (= [(float 1e20) (float 11.1)] (->vec (ndarray-api/cast nda2 (str dtype/FLOAT32)))))
-    ;; uint8 gets converted to native types after ->vec
-    (is (= [44.0 11.0 10.0 -1.0 -3.0] (->vec (ndarray-api/cast nda3 "uint8"))))))
-
-(deftest test-concat
-  (let [nda1 (ndarray/zeros [1 2])
-        nda2 (ndarray/ones [1 2])
-        out (ndarray/zeros [1 4])
-        res1 (ndarray-api/concat [nda1 nda2] 2) ;; num_args=2, dim=1 (default)
-        res2 (ndarray-api/concat {:data [nda1 nda2] :num-args 2 :dim 0}) ;; num_args=2, dim=0
-        res3 (ndarray-api/concat {:data [nda1 nda2 nda1] :num-args 3 :dim 1}) ;; num_args=3, dim=1
-        _ (ndarray-api/concat {:data [nda1 nda2] :num-args 2 :dim 1 :out out}) ;; store result in out
-        ]
-    (is (= [0.0 0.0 1.0 1.0] (->vec res1)))
-    (is (= [1 4] (shape-vec res1)))
-    (is (= [0.0 0.0 1.0 1.0] (->vec res2)))
-    (is (= [2 2] (shape-vec res2)))
-    (is (= [0.0 0.0 1.0 1.0 0.0 0.0] (->vec res3)))
-    (is (= [1 6] (shape-vec res3)))
-    (is (= [0.0 0.0 1.0 1.0] (->vec out)))
-    (is (= [1 4] (shape-vec out)))))
-
-(deftest test-embedding
-  (let [input-dim 4
-        output-dim 5
-        w (ndarray/array [0.  1.  2.  3.  4.
-                          5.  6.  7.  8.  9.
-                          10. 11. 12. 13. 14.
-                          15. 16. 17. 18. 19.]
-                         [4 5])
-        x (ndarray/array [1. 3.
-                          0. 2.]
-                         [2 2])
-        out (ndarray-api/embedding x w input-dim output-dim)]
-    (is (= [5.  6.  7.  8.  9.
-            15. 16. 17. 18. 19.
-            0.  1.  2.  3.  4.
-            10. 11. 12. 13. 14.]
-           (->vec out)))
-    (is (= [2 2 5] (shape-vec out)))))
-
-(deftest test-flatten
-  (let [nda (ndarray/array [1 2 3
-                            4 5 6
-                            7 8 9
-                            1 2 3
-                            4 5 6
-                            7 8 9]
-                           [2 3 3])
-        out (ndarray/zeros [2 9])
-        res (ndarray-api/flatten {:data nda})
-        _ (ndarray-api/flatten {:data nda :out out})]
-    (is (= [1. 2. 3. 4. 5. 6. 7. 8. 9.
-            1. 2. 3. 4. 5. 6. 7. 8. 9.] (->vec res)))
-    (is (= [2 9] (shape-vec res)))
-    (is (= [1. 2. 3. 4. 5. 6. 7. 8. 9.
-            1. 2. 3. 4. 5. 6. 7. 8. 9.] (->vec out)))
-    (is (= [2 9] (shape-vec out)))))
-
-(deftest test-instance-norm
-  (let [x (ndarray/array [1.1 2.2 3.3 4.4] [2 1 2])
-        gamma (ndarray/array [1.5] [1])
-        beta (ndarray/array [0.5] [1])
-        res (ndarray-api/instance-norm x gamma beta)]
-    (is (approx= 1e-4 [-0.9975 1.9975
-                       -0.9975 1.9975] (->vec res)))
-    (is (= [2 1 2] (shape-vec res)))))
-
-(deftest test-l2-normalization
-  (let [x (ndarray/array [1 2 3 4 2 2 5 6] [2 2 2])
-        res1 (ndarray-api/l2-normalization {:data x}) ;; instance-wise
-        res2 (ndarray-api/l2-normalization {:data x :mode "instance"})
-        res3 (ndarray-api/l2-normalization {:data x :mode "channel"})
-        res4 (ndarray-api/l2-normalization {:data x :mode "spatial"})]
-    (is (approx= 1e-4 [0.1825 0.3651
-                       0.5477 0.7303
-                       0.2407 0.2407
-                       0.6019 0.7223] (->vec res1)))
-    (is (approx= 1e-4 [0.1825 0.3651
-                       0.5477 0.7303
-                       0.2407 0.2407
-                       0.6019 0.7223] (->vec res2)))
-    (is (approx= 1e-4 [0.3162 0.4472
-                       0.9486 0.8944
-                       0.3714 0.3162
-                       0.9284 0.9486] (->vec res3)))
-    (is (approx= 1e-4 [0.4472 0.8944
-                       0.6    0.8
-                       0.7071 0.7071
-                       0.6402 0.7682] (->vec res4)))))
-
-(deftest test-pad
-  (let [x (ndarray/array [1 2 3
-                          4 5 6
-                          7 8 9
-                          10 11 12
-                          11 12 13
-                          14 15 16
-                          17 18 19
-                          20 21 22]
-                         [2 2 2 3])
-        res1 (ndarray-api/pad x "edge" [0,0,0,0,1,1,1,1])
-        res2 (ndarray-api/pad {:data x :mode "constant" :pad-width [0,0,0,0,1,1,1,1] :constant-value 0})]
-    (is (= [1.   1.   2.   3.   3.
-            1.   1.   2.   3.   3.
-            4.   4.   5.   6.   6.
-            4.   4.   5.   6.   6.
-            7.   7.   8.   9.   9.
-            7.   7.   8.   9.   9.
-            10.  10.  11.  12.  12.
-            10.  10.  11.  12.  12.
-            11.  11.  12.  13.  13.
-            11.  11.  12.  13.  13.
-            14.  14.  15.  16.  16.
-            14.  14.  15.  16.  16.
-            17.  17.  18.  19.  19.
-            17.  17.  18.  19.  19.
-            20.  20.  21.  22.  22.
-            20.  20.  21.  22.  22.] (->vec res1)))
-    (is (= [2 2 4 5] (shape-vec res1)))
-    (is (= [0.   0.   0.   0.   0.
-            0.   1.   2.   3.   0.
-            0.   4.   5.   6.   0.
-            0.   0.   0.   0.   0.
-            
-            0.   0.   0.   0.   0.
-            0.   7.   8.   9.   0.
-            0.  10.  11.  12.   0.
-            0.   0.   0.   0.   0.
-            
-            0.   0.   0.   0.   0.
-            0.  11.  12.  13.   0.
-            0.  14.  15.  16.   0.
-            0.   0.   0.   0.   0.
-            
-            0.   0.   0.   0.   0.
-            0.  17.  18.  19.   0.
-            0.  20.  21.  22.   0.
-            0.   0.   0.   0.   0.] (->vec res2)))
-    (is (= [2 2 4 5] (shape-vec res2)))))
-
-(deftest test-roi-pooling
-  (let [xi [[[[  0.,   1.,   2.,   3.,   4.,   5.],
-              [  6.,   7.,   8.,   9.,  10.,  11.],
-              [ 12.,  13.,  14.,  15.,  16.,  17.],
-              [ 18.,  19.,  20.,  21.,  22.,  23.],
-              [ 24.,  25.,  26.,  27.,  28.,  29.],
-              [ 30.,  31.,  32.,  33.,  34.,  35.],
-              [ 36.,  37.,  38.,  39.,  40.,  41.],
-              [ 42.,  43.,  44.,  45.,  46.,  47.]]]]
-        x (ndarray/array (-> xi flatten vec) [1 1 8 6])
-        y (ndarray/array [0 0 0 4 4] [1 5])
-        res1 (ndarray-api/roi-pooling x y [2 2] 1.0)
-        res2 (ndarray-api/roi-pooling x y [2 2] 0.7)]
-    (is (= [14. 16. 26. 28.] (->vec res1)))
-    (is (= [1 1 2 2] (shape-vec res1)))
-    (is (= [7. 9. 19. 21.] (->vec res2)))
-    (is (= [1 1 2 2] (shape-vec res2)))))
-
-(deftest test-reshape
-  (let [x (ndarray/array (vec (range 4)) [4])
-        y (ndarray/array (vec (range 24)) [2 3 4])
-        z (ndarray/array (vec (range 120)) [2 3 4 5])
-        res1 (ndarray-api/reshape {:data x :shape [2 2]})]
-    (is (= [0. 1. 2. 3.] (->vec res1)))
-    (is (= [2 2] (shape-vec res1)))
-    (is (= (map float (range 24)) (->vec (ndarray-api/reshape {:data y :shape [4 0 2]}))))
-    (is (= [4 3 2] (shape-vec (ndarray-api/reshape {:data y :shape [4 0 2]}))))
-    (is (= [2 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [2 0 0]}))))
-    (is (= [6 1 4] (shape-vec (ndarray-api/reshape {:data y :shape [6 1 -1]}))))
-    (is (= [3 1 8] (shape-vec (ndarray-api/reshape {:data y :shape [3 -1 8]}))))
-    (is (= [24] (shape-vec (ndarray-api/reshape {:data y :shape [-1]}))))
-    (is (= [2 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [-2]}))))
-    (is (= [2 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [2 -2]}))))
-    (is (= [2 3 4 1 1] (shape-vec (ndarray-api/reshape {:data y :shape [-2 1 1]}))))
-    (is (= [6 4] (shape-vec (ndarray-api/reshape {:data y :shape [-3 4]}))))
-    (is (= [6 20] (shape-vec (ndarray-api/reshape {:data z :shape [-3 -3]}))))
-    (is (= [2 12] (shape-vec (ndarray-api/reshape {:data y :shape [0 -3]}))))
-    (is (= [6 4] (shape-vec (ndarray-api/reshape {:data y :shape [-3 -2]}))))
-    (is (= [1 2 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [-4 1 2 -2]}))))
-    (is (= [2 1 3 4] (shape-vec (ndarray-api/reshape {:data y :shape [2 -4 -1 3 -2]}))))))
-
-(deftest test-sequence-last
-  (let [xi [[[  1.,   2.,   3.],
-             [  4.,   5.,   6.],
-             [  7.,   8.,   9.]],
-            
-            [[ 10.,   11.,   12.],
-             [ 13.,   14.,   15.],
-             [ 16.,   17.,   18.]],
-            
-            [[  19.,   20.,   21.],
-             [  22.,   23.,   24.],
-             [  25.,   26.,   27.]]]
-        x (ndarray/array (-> xi flatten vec) [3 3 3])
-        seq-len1 (ndarray/array [1 1 1] [3])
-        seq-len2 (ndarray/array [1 2 3] [3])
-        ;; This test is failing with an exception
-        ;; (most likely a scala generation issue)
-        ;; res1 (ndarray-api/sequence-last x nil)
-        ]
-    ;; (is (= [] (->vec res1)))
-))
-
-(deftest test-sequence-mask
-  (let [xi [[[  1.,   2.,   3.],
-             [  4.,   5.,   6.]],
-            
-            [[  7.,   8.,   9.],
-             [ 10.,  11.,  12.]],
-            
-            [[ 13.,  14.,   15.],
-             [ 16.,  17.,   18.]]]
-        x (ndarray/array (-> xi flatten vec) [3 2 3])
-        seq-len1 (ndarray/array [1 1] [2])
-        seq-len2 (ndarray/array [2 3] [2])
-        ;; Same issue as previous test
-        ;; res1 (ndarray-api/sequence-mask x seq-len1)
-        ]
-    ;; (is (= [] (->vec res1)))
-))
-
-(deftest test-slice-channel
-  (let [xi [[[ 1.] [ 2.]]
-            [[ 3.] [ 4.]]
-            [[ 5.] [ 6.]]]
-        x (ndarray/array (-> xi flatten vec) [3 2 1])
-        res1 (ndarray-api/slice-channel {:data x :num-outputs 2 :axis 1})
-        res2 (ndarray-api/slice-channel {:data x :num-outputs 3 :axis 0})
-        res3 (ndarray-api/slice-channel {:data x :num-outputs 3 :axis 0 :squeeze-axis 1})]
-    (is (= [1. 3. 5.] (->vec res1)))
-    (is (= [3 1 1] (shape-vec res1)))
-    (is (= [1. 2.] (->vec res2)))
-    (is (= [1 2 1] (shape-vec res2)))
-    (is (= [1. 2.] (->vec res3)))
-    (is (= [2 1] (shape-vec res3)))))
-
-(deftest test-softmax-activation
-  (let [x (ndarray/array [1 1 1 1 1 1] [2 3])
-        res1 (ndarray-api/softmax-activation {:data x :mode "instance"})]
-    (is (approx= 1e-3 [0.333 0.333 0.333
-                       0.333 0.333 0.333] (->vec res1)))
-    (is (= [2 3] (shape-vec res1)))))
-
-(deftest test-softmax-output
-  (let [datai [[1,2,3,4],[2,2,2,2],[3,3,3,3],[4,4,4,4]]
-        data (ndarray/array (-> datai flatten vec) [4 4])
-        label (ndarray/array [1,0,2,3] [4])
-        res1 (ndarray-api/softmax-output data label)]
-    (is (approx= 1e-4 [0.0321 0.0871 0.2369 0.6439
-                       0.25 0.25 0.25 0.25
-                       0.25 0.25 0.25 0.25
-                       0.25 0.25 0.25 0.25] (->vec res1)))
-    (is (= [4 4] (shape-vec res1)))))
-
-(deftest test-swap-axis
-  (let [x (ndarray/array (range 3) [1 3])
-        y (ndarray/array (range 8) [2 2 2])
-        res1 (ndarray-api/swap-axis {:data x :dim1 0 :dim2 1})
-        res2 (ndarray-api/swap-axis {:data y :dim1 0 :dim2 2})]
-    (is (= [0. 1. 2.] (->vec res1)))
-    (is (= [3 1] (shape-vec res1)))
-    (is (= [0. 4. 2. 6. 1. 5. 3. 7.] (->vec res2)))
-    (is (= [2 2 2] (shape-vec res2)))))
-
-(deftest test-abs
-  (let [x (ndarray/array [-2 0 3] [3])
-        res1 (ndarray-api/abs {:data x})]
-    (is (= [2. 0. 3.] (->vec res1)))
-    (is (= [3] (shape-vec res1)))))
-
-(deftest test-arccos
-  (let [x (ndarray/array [-1 -0.707 0 0.707 1] [5])
-        pi Math/PI
-        res1 (ndarray-api/arccos {:data x})]
-    (is (approx= 1e-3 [pi (* 0.75 pi) (* 0.5 pi) (* 0.25 pi) 0.] (->vec res1)))))
-
-(deftest test-arcsin
-  (let [x (ndarray/array [-1 -0.707 0 0.707 1] [5])
-        pi Math/PI
-        res1 (ndarray-api/arcsin {:data x})]
-    (is (approx= 1e-3 [(- (* 0.5 pi)) (- (* 0.25 pi)) 0 (* 0.25 pi) (* 0.5 pi)] (->vec res1)))))
-
-(deftest test-argmax
-  (let [x (ndarray/array (range 6) [2 3])
-        res1 (ndarray-api/argmax {:data x :axis 0})
-        res2 (ndarray-api/argmax {:data x :axis 1})
-        res3 (ndarray-api/argmax {:data x :axis 0 :keepdims true})
-        res4 (ndarray-api/argmax {:data x :axis 1 :keepdims true})]
-    (is (= [1. 1. 1.] (->vec res1)))
-    (is (= [3] (shape-vec res1)))
-    (is (= [2. 2.] (->vec res2)))
-    (is (= [2] (shape-vec res2)))
-    (is (= [1. 1. 1.] (->vec res3)))
-    (is (= [1 3] (shape-vec res3)))
-    (is (= [2. 2.] (->vec res4)))
-    (is (= [2 1] (shape-vec res4)))))
-
-(deftest test-argmax-channel
-  (let [x (ndarray/array (range 6) [2 3])
-        res1 (ndarray-api/argmax-channel {:data x})]
-    (is (= [2. 2.] (->vec res1)))
-    (is (= [2] (shape-vec res1)))))
-
-(deftest test-argmin
-  (let [x (ndarray/array (reverse (range 6)) [2 3])
-        res1 (ndarray-api/argmin {:data x :axis 0})
-        res2 (ndarray-api/argmin {:data x :axis 1})
-        res3 (ndarray-api/argmin {:data x :axis 0 :keepdims true})
-        res4 (ndarray-api/argmin {:data x :axis 1 :keepdims true})]
-    (is (= [1. 1. 1.] (->vec res1)))
-    (is (= [3] (shape-vec res1)))
-    (is (= [2. 2.] (->vec res2)))
-    (is (= [2] (shape-vec res2)))
-    (is (= [1. 1. 1.] (->vec res3)))
-    (is (= [1 3] (shape-vec res3)))
-    (is (= [2. 2.] (->vec res4)))
-    (is (= [2 1] (shape-vec res4)))))
-
-(deftest test-argsort
-  (let [x (ndarray/array [0.3  0.2  0.4
-                          0.1  0.3  0.2]
-                         [2 3])
-        y (ndarray/array [0.3 0.2 0.4 0.1 0.3 0.2] [6])
-        res1 (ndarray-api/argsort {:data x})
-        res2 (ndarray-api/argsort {:data x :axis 0})
-        res3 (ndarray-api/argsort {:data y})]
-    (is (= [1. 0. 2.
-            0. 2. 1.]
-           (->vec res1)))
-    (is (= [2 3] (shape-vec res1)))
-    (is (= [1. 0. 1.
-            0. 1. 0.]
-           (->vec res2)))
-    (is (= [2 3] (shape-vec res1)))
-    (is (= [3. 1. 5. 0. 4. 2.] (->vec res3)))
-    (is (= [6] (shape-vec res3)))))
-
-(deftest test-batch-take
-  (let [x (ndarray/array (range 6) [3 2])
-        i (ndarray/as-type (ndarray/array [0 1 0] [3]) dtype/INT32)
-        res1 (ndarray-api/batch-take x i)        ]
-    (is (= [0. 3. 4.] (->vec res1)))))
-
-(deftest test-broadcast-add
-  (let [x (ndarray/ones [2 3])
-        y (ndarray/array (range 2) [2 1])
-        res1 (ndarray-api/broadcast-add x y)]
-    (is (= [1. 1. 1. 2. 2. 2.] (->vec res1)))
-    (is (= [2 3] (shape-vec res1)))))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj
index c4edb19d7e3d..2498e3027bcf 100644
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/operator_test.clj
@@ -80,14 +80,6 @@
                            np-out (ndarray/->vec arr-label))]
       (is (approx= 1e-6 npout-back arr-grad)))))
 
-(deftest test-regression
-  (check-regression (sym/logistic-regression-output {:data (sym/variable "data") :label (sym/variable "label")})
-                    (fn [x] (/ 1.0 (+ 1.0 (Math/exp (* -1.0 x)))))
-                    (fn [x y] (- x y)))
-  (check-regression (sym/linear-regression-output {:data (sym/variable "data") :label (sym/variable "label")})
-                    (fn [x] x)
-                    (fn [x y] (- x y))))
-
 (deftest swap-axes
   (let [data (sym/variable "data")
         shape-vec [2 3 4]
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_api_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_api_test.clj
index b642ad75d1d0..03bf7c31b30c 100644
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_api_test.clj
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_api_test.clj
@@ -49,13 +49,3 @@
     (= (sym/list-arguments oldfc) (-> (sym/get-internals net1)
                                       (sym/get "fc1_output")
                                       (sym/list-arguments)))))
-
-(deftest test-infer-type
-  (let [data (sym/variable "data")
-        f32data (sym-api/cast {:data data :dtype "float32"})
-        fc1 (sym-api/fully-connected {:data f32data :num-hidden 128 :name"fc1"})
-        mlp (sym-api/softmax-output {:data fc1 :name"softmax"})
-        [arg out aux] (sym/infer-type mlp {:data dtype/FLOAT64})]
-    (is (= [dtype/FLOAT64 dtype/FLOAT32 dtype/FLOAT32 dtype/FLOAT32] (util/buffer->vec arg)))
-    (is (= [dtype/FLOAT32] (util/buffer->vec out)))
-    (is (= [] (util/buffer->vec aux)))))
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_test.clj
index 4d1b493ab2b6..5308b883aa3c 100644
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_test.clj
+++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/symbol_test.clj
@@ -50,16 +50,6 @@
                                       (sym/get "fc1_output")
                                       (sym/list-arguments)))))
 
-(deftest test-infer-type
-  (let [data (sym/variable "data")
-        f32data (sym/cast {:data data :dtype "float32"})
-        fc1 (sym/fully-connected "fc1" {:data f32data :num-hidden 128})
-        mlp (sym/softmax-output "softmax" {:data fc1})
-        [arg out aux] (sym/infer-type mlp {:data dtype/FLOAT64})]
-    (is (= [dtype/FLOAT64 dtype/FLOAT32 dtype/FLOAT32 dtype/FLOAT32] (util/buffer->vec arg)))
-    (is (= [dtype/FLOAT32] (util/buffer->vec out)))
-    (is (= [] (util/buffer->vec aux)))))
-
 (deftest test-copy
   (let [data (sym/variable "data")
         data2 (sym/clone data)]
diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/visualization_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/visualization_test.clj
deleted file mode 100644
index a2bea9478390..000000000000
--- a/contrib/clojure-package/test/org/apache/clojure_mxnet/visualization_test.clj
+++ /dev/null
@@ -1,32 +0,0 @@
-;;
-;; Licensed to the Apache Software Foundation (ASF) under one or more
-;; contributor license agreements.  See the NOTICE file distributed with
-;; this work for additional information regarding copyright ownership.
-;; The ASF licenses this file to You under the Apache License, Version 2.0
-;; (the "License"); you may not use this file except in compliance with
-;; the License.  You may obtain a copy of the License at
-;;
-;;    http://www.apache.org/licenses/LICENSE-2.0
-;;
-;; Unless required by applicable law or agreed to in writing, software
-;; distributed under the License is distributed on an "AS IS" BASIS,
-;; WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-;; See the License for the specific language governing permissions and
-;; limitations under the License.
-;;
-
-(ns org.apache.clojure-mxnet.visualization-test
-  (:require [org.apache.clojure-mxnet.symbol :as sym]
-            [org.apache.clojure-mxnet.visualization :as viz]
-            [clojure.test :refer :all])
-  (:import (org.apache.mxnet Visualization$Dot)))
-
-(deftest test-plot-network
-  (let [to-plot-sym (as-> (sym/variable "data") data
-                      (sym/flatten "fl" {:data data})
-                      (sym/softmax-output "softmax" {:data data}))
-        dot (viz/plot-network to-plot-sym
-                              {"data" [1 1 28 28]}
-                              {:title "foo"
-                               :node-attrs {:shape "oval" :fixedsize "false"}})]
-    (is (instance? Visualization$Dot dot))))
diff --git a/cpp-package/example/CMakeLists.txt b/cpp-package/example/CMakeLists.txt
index bf9427af03ee..d54843f319b4 100644
--- a/cpp-package/example/CMakeLists.txt
+++ b/cpp-package/example/CMakeLists.txt
@@ -18,40 +18,6 @@
 # Explicitly set GENERATED property https://gitlab.kitware.com/cmake/cmake/issues/18399
 set_property(SOURCE ${CMAKE_CURRENT_LIST_DIR}/../include/mxnet-cpp/op.h PROPERTY GENERATED 1)
 
-add_executable(test_regress_label test_regress_label.cpp)
-target_link_libraries(test_regress_label mxnet_cpp)
-
-add_executable(lenet lenet.cpp)
-target_link_libraries(lenet mxnet_cpp)
-
-add_executable(lenet_with_mxdataiter lenet_with_mxdataiter.cpp)
-target_link_libraries(lenet_with_mxdataiter mxnet_cpp)
-
-add_executable(alexnet alexnet.cpp)
-target_link_libraries(alexnet mxnet_cpp)
-
-add_executable(charRNN charRNN.cpp)
-target_link_libraries(charRNN mxnet_cpp)
-
-add_executable(googlenet googlenet.cpp)
-target_link_libraries(googlenet mxnet_cpp)
-
-add_executable(inception_bn inception_bn.cpp)
-target_link_libraries(inception_bn mxnet_cpp)
-
-add_executable(mlp mlp.cpp)
-target_link_libraries(mlp mxnet_cpp)
-
-add_executable(mlp_cpu mlp_cpu.cpp)
-target_link_libraries(mlp_cpu mxnet_cpp)
-
-add_executable(mlp_gpu mlp_gpu.cpp)
-target_link_libraries(mlp_gpu mxnet_cpp)
-
-add_executable(resnet resnet.cpp)
-target_link_libraries(resnet mxnet_cpp)
-
-
 if(MSVC)
   add_custom_target(cpp_package_deploy_library ALL
     DEPENDS mxnet
diff --git a/cpp-package/example/README.md b/cpp-package/example/README.md
index 555316dd1ac3..208532e0066e 100644
--- a/cpp-package/example/README.md
+++ b/cpp-package/example/README.md
@@ -35,97 +35,3 @@ By default, the examples are built to be run on GPU. To build examples to run on
 
 The examples that are built to be run on GPU may not work on the non-GPU machines.
 The makefile will also download the necessary data files and store in a data folder. (The download will take couple of minutes, but will be done only once on a fresh installation.)
-
-
-## Examples demonstrating training workflow
-
-This directory contains following examples. In order to run the examples, ensure that the path to the MXNet shared library is added to the OS specific environment variable viz. **LD\_LIBRARY\_PATH** for Linux, Mac and Ubuntu OS and **PATH** for Windows OS. For example `export LD_LIBRARY_PATH=/usr/local/cuda/lib64:/home/ubuntu/incubator-mxnet/lib` on ubuntu using gpu.
-
-### [alexnet.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/alexnet.cpp>)
-
-The example implements the C++ version of AlexNet. The networks trains on MNIST data. The number of epochs can be specified as a command line argument. For example to train with 10 epochs use the following:
-
-```
-build/alexnet 10
-```
-
-### [googlenet.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/googlenet.cpp>)
-
-The code implements a GoogLeNet/Inception network using the C++ API. The example uses MNIST data to train the network. By default, the example trains the model for 100 epochs. The number of epochs can also be specified in the command line. For example, to train the model for 10 epochs use the following:
-
-```
-build/googlenet 10
-```
-
-### [mlp.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/mlp.cpp>)
-
-The code implements a multilayer perceptron from scratch. The example creates its own dummy data to train the model. The example does not require command line parameters. It trains the model for 20,000 epochs.
-To run the example use the following command:
-
-```
-build/mlp
-```
-
-### [mlp_cpu.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/mlp_cpu.cpp>)
-
-The code implements a multilayer perceptron to train the MNIST data. The code demonstrates the use of "SimpleBind"  C++ API and MNISTIter. The example is designed to work on CPU. The example does not require command line parameters.
-To run the example use the following command:
-
-```
-build/mlp_cpu
-```
-
-### [mlp_gpu.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/mlp_gpu.cpp>)
-
-The code implements a multilayer perceptron to train the MNIST data. The code demonstrates the use of the "SimpleBind"  C++ API and MNISTIter. The example is designed to work on GPU. The example does not require command line arguments. To run the example execute following command:
-
-```
-build/mlp_gpu
-```
-
-### [mlp_csv.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/mlp_csv.cpp>)
-
-The code implements a multilayer perceptron to train the MNIST data. The code demonstrates the use of the "SimpleBind"  C++ API and CSVIter. The CSVIter can iterate data that is in CSV format. The example can be run on CPU or GPU. The example usage is as follows:
-
-```
-build/mlp_csv --train data/mnist_data/mnist_train.csv --test data/mnist_data/mnist_test.csv --epochs 10 --batch_size 100 --hidden_units "128 64 64" --gpu
-```
-* To get the `mnist_training_set.csv` and `mnist_test_set.csv` please run the following command:
-```python
-# in incubator-mxnet/cpp-package/example directory
-python mnist_to_csv.py ./data/mnist_data/train-images-idx3-ubyte ./data/mnist_data/train-labels-idx1-ubyte ./data/mnist_data/mnist_train.csv 60000
-python mnist_to_csv.py ./data/mnist_data/t10k-images-idx3-ubyte ./data/mnist_data/t10k-labels-idx1-ubyte ./data/mnist_data/mnist_test.csv 10000
-```
-
-### [resnet.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/resnet.cpp>)
-
-The code implements a resnet model using the C++ API. The model is used to train MNIST data. The number of epochs for training the model can be specified on the command line. By default, model is trained for 100 epochs. For example, to train with 10 epochs use the following command:
-
-```
-build/resnet 10
-```
-
-### [lenet.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/lenet.cpp>)
-
-The code implements a lenet model using the C++ API. It uses MNIST training data in CSV format to train the network. The example does not use built-in CSVIter to read the data from CSV file. The number of epochs can be specified on the command line. By default, the mode is trained for 100,000 epochs. For example, to train with 10 epochs use the following command:
-
-```
-build/lenet 10
-```
-### [lenet\_with\_mxdataiter.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/mlp_cpu.cpp>)
-
-The code implements a lenet model using the C++ API. It uses MNIST training data to train the network. The example uses built-in MNISTIter to read the data. The number of epochs can be specified on the command line. By default, the mode is trained for 100 epochs. For example, to train with 10 epochs use the following command:
-
-```
-build/lenet_with_mxdataiter 10
-```
-
-In addition, there is `run_lenet_with_mxdataiter.sh` that downloads the mnist data and run `lenet_with_mxdataiter` example.
-
-### [inception_bn.cpp](<https://github.com/apache/incubator-mxnet/blob/master/cpp-package/example/inception_bn.cpp>)
-
-The code implements an Inception network using the C++ API with batch normalization. The example uses MNIST data to train the network. The model trains for 100 epochs. The example can be run by executing the following command:
-
-```
-build/inception_bn
-```
diff --git a/cpp-package/example/alexnet.cpp b/cpp-package/example/alexnet.cpp
deleted file mode 100644
index 1c182182c1a5..000000000000
--- a/cpp-package/example/alexnet.cpp
+++ /dev/null
@@ -1,358 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- */
-#include <iostream>
-#include <map>
-#include <string>
-#include <fstream>
-#include <cstdlib>
-#include "utils.h"
-#include "mxnet-cpp/MxNetCpp.h"
-
-using namespace mxnet::cpp;
-
-Symbol AlexnetSymbol(int num_classes) {
-  auto input_data = Symbol::Variable("data");
-  auto target_label = Symbol::Variable("label");
-  /*stage 1*/
-  auto conv1 = Operator("Convolution")
-                   .SetParam("kernel", Shape(11, 11))
-                   .SetParam("num_filter", 96)
-                   .SetParam("stride", Shape(4, 4))
-                   .SetParam("dilate", Shape(1, 1))
-                   .SetParam("pad", Shape(0, 0))
-                   .SetParam("num_group", 1)
-                   .SetParam("workspace", 512)
-                   .SetParam("no_bias", false)
-                   .SetInput("data", input_data)
-                   .CreateSymbol("conv1");
-  auto relu1 = Operator("Activation")
-                   .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */
-                   .SetInput("data", conv1)
-                   .CreateSymbol("relu1");
-  auto pool1 = Operator("Pooling")
-                   .SetParam("kernel", Shape(3, 3))
-                   .SetParam("pool_type", "max") /*avg,max,sum */
-                   .SetParam("global_pool", false)
-                   .SetParam("stride", Shape(2, 2))
-                   .SetParam("pad", Shape(0, 0))
-                   .SetInput("data", relu1)
-                   .CreateSymbol("pool1");
-  auto lrn1 = Operator("LRN")
-                  .SetParam("nsize", 5)
-                  .SetParam("alpha", 0.0001)
-                  .SetParam("beta", 0.75)
-                  .SetParam("knorm", 1)
-                  .SetInput("data", pool1)
-                  .CreateSymbol("lrn1");
-  /*stage 2*/
-  auto conv2 = Operator("Convolution")
-                   .SetParam("kernel", Shape(5, 5))
-                   .SetParam("num_filter", 256)
-                   .SetParam("stride", Shape(1, 1))
-                   .SetParam("dilate", Shape(1, 1))
-                   .SetParam("pad", Shape(2, 2))
-                   .SetParam("num_group", 1)
-                   .SetParam("workspace", 512)
-                   .SetParam("no_bias", false)
-                   .SetInput("data", lrn1)
-                   .CreateSymbol("conv2");
-  auto relu2 = Operator("Activation")
-                   .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */
-                   .SetInput("data", conv2)
-                   .CreateSymbol("relu2");
-  auto pool2 = Operator("Pooling")
-                   .SetParam("kernel", Shape(3, 3))
-                   .SetParam("pool_type", "max") /*avg,max,sum */
-                   .SetParam("global_pool", false)
-                   .SetParam("stride", Shape(2, 2))
-                   .SetParam("pad", Shape(0, 0))
-                   .SetInput("data", relu2)
-                   .CreateSymbol("pool2");
-  auto lrn2 = Operator("LRN")
-                  .SetParam("nsize", 5)
-                  .SetParam("alpha", 0.0001)
-                  .SetParam("beta", 0.75)
-                  .SetParam("knorm", 1)
-                  .SetInput("data", pool2)
-                  .CreateSymbol("lrn2");
-  /*stage 3*/
-  auto conv3 = Operator("Convolution")
-                   .SetParam("kernel", Shape(3, 3))
-                   .SetParam("num_filter", 384)
-                   .SetParam("stride", Shape(1, 1))
-                   .SetParam("dilate", Shape(1, 1))
-                   .SetParam("pad", Shape(1, 1))
-                   .SetParam("num_group", 1)
-                   .SetParam("workspace", 512)
-                   .SetParam("no_bias", false)
-                   .SetInput("data", lrn2)
-                   .CreateSymbol("conv3");
-  auto relu3 = Operator("Activation")
-                   .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */
-                   .SetInput("data", conv3)
-                   .CreateSymbol("relu3");
-  auto conv4 = Operator("Convolution")
-                   .SetParam("kernel", Shape(3, 3))
-                   .SetParam("num_filter", 384)
-                   .SetParam("stride", Shape(1, 1))
-                   .SetParam("dilate", Shape(1, 1))
-                   .SetParam("pad", Shape(1, 1))
-                   .SetParam("num_group", 1)
-                   .SetParam("workspace", 512)
-                   .SetParam("no_bias", false)
-                   .SetInput("data", relu3)
-                   .CreateSymbol("conv4");
-  auto relu4 = Operator("Activation")
-                   .SetParam("act_type", "relu") /*relu,sigmoid,softrelu,tanh */
-                   .SetInput("data", conv4)
-                   .CreateSymbol("relu4");
-  auto conv5 = Operator("Convolution")
-                   .SetParam("kernel", Shape(3, 3))
-                   .SetParam("num_filter", 256)
-                   .SetParam("stride", Shape(1, 1))
-                   .SetParam("dilate", Shape(1, 1))
-                   .SetParam("pad", Shape(1, 1))
-                   .SetParam("num_group", 1)
-                   .SetParam("workspace", 512)
-                   .SetParam("no_bias", false)
-                   .SetInput("data", relu4)
-                   .CreateSymbol("conv5");
-  auto relu5 = Operator("Activation")
-                   .SetParam("act_type", "relu")
-                   .SetInput("data", conv5)
-                   .CreateSymbol("relu5");
-  auto pool3 = Operator("Pooling")
-                   .SetParam("kernel", Shape(3, 3))
-                   .SetParam("pool_type", "max")
-                   .SetParam("global_pool", false)
-                   .SetParam("stride", Shape(2, 2))
-                   .SetParam("pad", Shape(0, 0))
-                   .SetInput("data", relu5)
-                   .CreateSymbol("pool3");
-  /*stage4*/
-  auto flatten =
-      Operator("Flatten").SetInput("data", pool3).CreateSymbol("flatten");
-  auto fc1 = Operator("FullyConnected")
-                 .SetParam("num_hidden", 4096)
-                 .SetParam("no_bias", false)
-                 .SetInput("data", flatten)
-                 .CreateSymbol("fc1");
-  auto relu6 = Operator("Activation")
-                   .SetParam("act_type", "relu")
-                   .SetInput("data", fc1)
-                   .CreateSymbol("relu6");
-  auto dropout1 = Operator("Dropout")
-                      .SetParam("p", 0.5)
-                      .SetInput("data", relu6)
-                      .CreateSymbol("dropout1");
-  /*stage5*/
-  auto fc2 = Operator("FullyConnected")
-                 .SetParam("num_hidden", 4096)
-                 .SetParam("no_bias", false)
-                 .SetInput("data", dropout1)
-                 .CreateSymbol("fc2");
-  auto relu7 = Operator("Activation")
-                   .SetParam("act_type", "relu")
-                   .SetInput("data", fc2)
-                   .CreateSymbol("relu7");
-  auto dropout2 = Operator("Dropout")
-                      .SetParam("p", 0.5)
-                      .SetInput("data", relu7)
-                      .CreateSymbol("dropout2");
-  /*stage6*/
-  auto fc3 = Operator("FullyConnected")
-                 .SetParam("num_hidden", num_classes)
-                 .SetParam("no_bias", false)
-                 .SetInput("data", dropout2)
-                 .CreateSymbol("fc3");
-  auto softmax = Operator("SoftmaxOutput")
-                     .SetParam("grad_scale", 1)
-                     .SetParam("ignore_label", -1)
-                     .SetParam("multi_output", false)
-                     .SetParam("use_ignore", false)
-                     .SetParam("normalization", "null") /*batch,null,valid */
-                     .SetInput("data", fc3)
-                     .SetInput("label", target_label)
-                     .CreateSymbol("softmax");
-  return softmax;
-}
-
-NDArray ResizeInput(NDArray data, const Shape new_shape) {
-  NDArray pic = data.Reshape(Shape(0, 1, 28, 28));
-  NDArray pic_1channel;
-  Operator("_contrib_BilinearResize2D")
-    .SetParam("height", new_shape[2])
-    .SetParam("width", new_shape[3])
-    (pic).Invoke(pic_1channel);
-  NDArray output;
-  Operator("tile")
-    .SetParam("reps", Shape(1, 3, 1, 1))
-    (pic_1channel).Invoke(output);
-  return output;
-}
-
-int main(int argc, char const *argv[]) {
-  /*basic config*/
-  int max_epo = argc > 1 ? strtol(argv[1], nullptr, 10) : 100;
-  float learning_rate = 1e-4;
-  float weight_decay = 1e-4;
-
-  /*context*/
-  auto ctx = Context::cpu();
-  int num_gpu;
-  MXGetGPUCount(&num_gpu);
-  int batch_size = 32;
-#if !MXNET_USE_CPU
-  if (num_gpu > 0) {
-    ctx = Context::gpu();
-    batch_size = 256;
-  }
-#endif
-
-  TRY
-  /*net symbol*/
-  auto Net = AlexnetSymbol(10);
-
-  /*args_map and aux_map is used for parameters' saving*/
-  std::map<std::string, NDArray> args_map;
-  std::map<std::string, NDArray> aux_map;
-
-  /*we should tell mxnet the shape of data and label*/
-  const Shape data_shape = Shape(batch_size, 3, 256, 256),
-              label_shape = Shape(batch_size);
-  args_map["data"] = NDArray(data_shape, ctx);
-  args_map["label"] = NDArray(label_shape, ctx);
-
-  /*with data and label, executor can be generated automatically*/
-  auto *exec = Net.SimpleBind(ctx, args_map);
-  auto arg_names = Net.ListArguments();
-  aux_map = exec->aux_dict();
-  args_map = exec->arg_dict();
-
-  /*if fine tune from some pre-trained model, we should load the parameters*/
-  // NDArray::Load("./model/alex_params_3", nullptr, &args_map);
-  /*else, we should use initializer Xavier to init the params*/
-  Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2.34);
-  for (auto &arg : args_map) {
-    /*be careful here, the arg's name must has some specific ends or starts for
-     * initializer to call*/
-    xavier(arg.first, &arg.second);
-  }
-
-  /*these binary files should be generated using im2rc tools, which can be found
-   * in mxnet/bin*/
-  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
-                                "./data/mnist_data/train-labels-idx1-ubyte",
-                                "./data/mnist_data/t10k-images-idx3-ubyte",
-                                "./data/mnist_data/t10k-labels-idx1-ubyte"
-                              };
-
-  auto train_iter =  MXDataIter("MNISTIter");
-  if (!setDataIter(&train_iter, "Train", data_files, batch_size)) {
-    return 1;
-  }
-
-  auto val_iter = MXDataIter("MNISTIter");
-  if (!setDataIter(&val_iter, "Label", data_files, batch_size)) {
-    return 1;
-  }
-
-  Optimizer* opt = OptimizerRegistry::Find("sgd");
-  opt->SetParam("momentum", 0.9)
-     ->SetParam("rescale_grad", 1.0 / batch_size)
-     ->SetParam("clip_gradient", 10)
-     ->SetParam("lr", learning_rate)
-     ->SetParam("wd", weight_decay);
-
-  Accuracy acu_train, acu_val;
-  LogLoss logloss_train, logloss_val;
-  for (int epoch = 0; epoch < max_epo; ++epoch) {
-    LG << "Train Epoch: " << epoch;
-    /*reset the metric every epoch*/
-    acu_train.Reset();
-    /*reset the data iter every epoch*/
-    train_iter.Reset();
-    int iter = 0;
-    while (train_iter.Next()) {
-      auto batch = train_iter.GetDataBatch();
-      /*use copyto to feed new data and label to the executor*/
-      ResizeInput(batch.data, data_shape).CopyTo(&args_map["data"]);
-      batch.label.CopyTo(&args_map["label"]);
-      exec->Forward(true);
-      exec->Backward();
-      for (size_t i = 0; i < arg_names.size(); ++i) {
-        if (arg_names[i] == "data" || arg_names[i] == "label") continue;
-        opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
-      }
-
-      NDArray::WaitAll();
-      acu_train.Update(batch.label, exec->outputs[0]);
-      logloss_train.Reset();
-      logloss_train.Update(batch.label, exec->outputs[0]);
-      ++iter;
-      LG << "EPOCH: " << epoch << " ITER: " << iter
-         << " Train Accuracy: " << acu_train.Get()
-         << " Train Loss: " << logloss_train.Get();
-    }
-    LG << "EPOCH: " << epoch << " Train Accuracy: " << acu_train.Get();
-
-    LG << "Val Epoch: " << epoch;
-    acu_val.Reset();
-    val_iter.Reset();
-    logloss_val.Reset();
-    iter = 0;
-    while (val_iter.Next()) {
-      auto batch = val_iter.GetDataBatch();
-      ResizeInput(batch.data, data_shape).CopyTo(&args_map["data"]);
-      batch.label.CopyTo(&args_map["label"]);
-      exec->Forward(false);
-      NDArray::WaitAll();
-      acu_val.Update(batch.label, exec->outputs[0]);
-      logloss_val.Update(batch.label, exec->outputs[0]);
-      LG << "EPOCH: " << epoch << " ITER: " << iter << " Val Accuracy: " << acu_val.Get();
-      ++iter;
-    }
-    LG << "EPOCH: " << epoch << " Val Accuracy: " << acu_val.Get();
-    LG << "EPOCH: " << epoch << " Val LogLoss: " << logloss_val.Get();
-
-    /*save the parameters*/
-    std::stringstream ss;
-    ss << epoch;
-    std::string epoch_str;
-    ss >> epoch_str;
-    std::string save_path_param = "alex_param_" + epoch_str;
-    auto save_args = args_map;
-    /*we do not want to save the data and label*/
-    save_args.erase(save_args.find("data"));
-    save_args.erase(save_args.find("label"));
-    /*the alexnet does not get any aux array, so we do not need to save
-     * aux_map*/
-    LG << "EPOCH: " << epoch << " Saving to..." << save_path_param;
-    NDArray::Save(save_path_param, save_args);
-  }
-  /*don't foget to release the executor*/
-  delete exec;
-  delete opt;
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp
deleted file mode 100644
index 3d90e2e9ed9f..000000000000
--- a/cpp-package/example/charRNN.cpp
+++ /dev/null
@@ -1,756 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Hua Zhang mz24cn@hotmail.com
- * The code implements C++ version charRNN for mxnet\example\rnn\char-rnn.ipynb with MXNet.cpp API.
- * The generated params file is compatiable with python version.
- * train() and predict() has been verified with original data samples.
- * 2017/1/23:
- * Add faster version charRNN based on built-in cuDNN RNN operator, 10 times faster.
- * Add time major computation graph, although no substantial performance difference.
- * Support continuing training from last params file.
- * Rename params file epoch number starts from zero.
- */
-
-#if _MSC_VER
-#pragma warning(disable: 4996)  // VS2015 complains on 'std::copy' ...
-#endif
-#include <cstring>
-#include <iostream>
-#include <fstream>
-#include <unordered_map>
-#include <vector>
-#include <string>
-#include <tuple>
-#include <algorithm>
-#include <functional>
-#include <thread>
-#include <chrono>
-#include "mxnet-cpp/MxNetCpp.h"
-#include "utils.h"
-
-using namespace mxnet::cpp;
-
-struct LSTMState {
-  Symbol C;
-  Symbol h;
-};
-
-struct LSTMParam {
-  Symbol i2h_weight;
-  Symbol i2h_bias;
-  Symbol h2h_weight;
-  Symbol h2h_bias;
-};
-
-bool TIME_MAJOR = true;
-
-// LSTM Cell symbol
-LSTMState LSTM(int num_hidden, const Symbol& indata, const LSTMState& prev_state,
-    const LSTMParam& param, int seqidx, int layeridx, mx_float dropout = 0) {
-  auto input = dropout > 0? Dropout(indata, dropout) : indata;
-  auto prefix = std::string("t") + std::to_string(seqidx) + "_l" + std::to_string(layeridx);
-  auto i2h = FullyConnected(prefix + "_i2h", input, param.i2h_weight, param.i2h_bias,
-      num_hidden * 4);
-  auto h2h = FullyConnected(prefix + "_h2h", prev_state.h, param.h2h_weight, param.h2h_bias,
-      num_hidden * 4);
-  auto gates = i2h + h2h;
-  auto slice_gates = SliceChannel(prefix + "_slice", gates, 4);
-  auto in_gate = Activation(slice_gates[0], ActivationActType::kSigmoid);
-  auto in_transform = Activation(slice_gates[1], ActivationActType::kTanh);
-  auto forget_gate = Activation(slice_gates[2], ActivationActType::kSigmoid);
-  auto out_gate = Activation(slice_gates[3], ActivationActType::kSigmoid);
-
-  LSTMState state;
-  state.C = (forget_gate * prev_state.C) + (in_gate * in_transform);
-  state.h = out_gate * Activation(state.C, ActivationActType::kTanh);
-  return state;
-}
-
-Symbol LSTMUnroll(int num_lstm_layer, int sequence_length, int input_dim,
-        int num_hidden, int num_embed, mx_float dropout = 0) {
-  auto isTrain = sequence_length > 1;
-  auto data = Symbol::Variable("data");
-  if (TIME_MAJOR && isTrain)
-    data = transpose(data);
-  auto embed_weight = Symbol::Variable("embed_weight");
-  auto embed = Embedding("embed", data, embed_weight, input_dim, num_embed);
-  auto wordvec = isTrain? SliceChannel(embed, sequence_length, TIME_MAJOR? 0 : 1, true) : embed;
-
-  std::vector<LSTMState> last_states;
-  std::vector<LSTMParam> param_cells;
-  for (int l = 0; l < num_lstm_layer; l++) {
-    std::string layer = "l" + std::to_string(l);
-    LSTMParam param;
-    param.i2h_weight = Symbol::Variable(layer + "_i2h_weight");
-    param.i2h_bias = Symbol::Variable(layer + "_i2h_bias");
-    param.h2h_weight = Symbol::Variable(layer + "_h2h_weight");
-    param.h2h_bias = Symbol::Variable(layer + "_h2h_bias");
-    param_cells.push_back(param);
-    LSTMState state;
-    state.C = Symbol::Variable(layer + "_init_c");
-    state.h = Symbol::Variable(layer + "_init_h");
-    last_states.push_back(state);
-  }
-
-  std::vector<Symbol> hidden_all;
-  for (int i = 0; i < sequence_length; i++) {
-    auto hidden = wordvec[i];
-    for (int layer = 0; layer < num_lstm_layer; layer++) {
-      double dp_ratio = layer == 0? 0 : dropout;
-      auto next_state = LSTM(num_hidden, hidden, last_states[layer], param_cells[layer],
-          i, layer, dp_ratio);
-      hidden = next_state.h;
-      last_states[layer] = next_state;
-    }
-    if (dropout > 0)
-      hidden = Dropout(hidden, dropout);
-    hidden_all.push_back(hidden);
-  }
-
-  auto hidden_concat = isTrain? Concat(hidden_all, hidden_all.size(), 0) : hidden_all[0];
-  auto cls_weight = Symbol::Variable("cls_weight");
-  auto cls_bias = Symbol::Variable("cls_bias");
-  auto pred = FullyConnected("pred", hidden_concat, cls_weight, cls_bias, input_dim);
-
-  auto label = Symbol::Variable("softmax_label");
-  label = transpose(label);
-  label = Reshape(label, Shape(), false, Shape(0), false);  // -1: infer from graph
-  auto sm = SoftmaxOutput("softmax", pred, label);
-  if (isTrain)
-    return sm;
-
-  std::vector<Symbol> outputs = { sm };
-  for (auto& state : last_states) {
-    outputs.push_back(state.C);
-    outputs.push_back(state.h);
-  }
-  return Symbol::Group(outputs);
-}
-
-// Currently mxnet GPU version RNN operator is implemented via *fast* NVIDIA cuDNN.
-Symbol LSTMWithBuiltInRNNOp(int num_lstm_layer, int sequence_length, int input_dim,
- int num_hidden, int num_embed, mx_float dropout = 0) {
-  auto isTrain = sequence_length > 1;
-  auto data = Symbol::Variable("data");
-  if (TIME_MAJOR && isTrain)
-    data = transpose(data);
-
-  auto embed_weight = Symbol::Variable("embed_weight");
-  auto embed = Embedding("embed", data, embed_weight, input_dim, num_embed);
-  auto label = Symbol::Variable("softmax_label");
-  label = transpose(label);
-  label = Reshape(label, Shape(), false,
-                  Shape(0), false);  // FullyConnected requires one dimension
-  if (!TIME_MAJOR && isTrain)
-    embed = SwapAxis(embed, 0, 1);  // Change to time-major as cuDNN requires
-
-  // We need not do the SwapAxis op as python version does. Direct and better performance in C++!
-  auto rnn_h_init = Symbol::Variable("LSTM_init_h");
-  auto rnn_c_init = Symbol::Variable("LSTM_init_c");
-  auto rnn_params = Symbol::Variable("LSTM_parameters");  // See explanations near RNNXavier class
-  auto variable_sequence_length = Symbol::Variable("sequence_length");
-  auto rnn = RNN(embed, rnn_params, rnn_h_init, rnn_c_init, variable_sequence_length, num_hidden,
-                 num_lstm_layer, RNNMode::kLstm, false, dropout, !isTrain);
-  auto hidden = Reshape(rnn[0], Shape(), false, Shape(0, num_hidden), false);
-
-  auto cls_weight = Symbol::Variable("cls_weight");
-  auto cls_bias = Symbol::Variable("cls_bias");
-  auto pred = FullyConnected("pred", hidden, cls_weight, cls_bias, input_dim);
-  /*In rnn-time-major/rnn_cell_demo.py, the author claimed time-major version speeds up
-   * 1.5~2 times versus batch version. I doubts on the conclusion. In my test, the performance
-   * of both codes are almost same. In fact, there are no substantially differences between
-   * two codes. They are both based on time major cuDNN, the computation graph only differs
-   * slightly on the choices of where to put Reshape/SwapAxis/transpose operation. Here I don't
-   * use Reshape on pred and keep label shape on SoftmaxOutput like time major version code,
-   * but Reshape on label for simplification. It doesn't make influence on performacne. */
-
-  auto sm = SoftmaxOutput("softmax", pred, label);
-  if (isTrain)
-    return sm;
-  else
-    return Symbol::Group({ sm, rnn[1/*RNNOpOutputs::kStateOut=1*/],
-    rnn[2/*RNNOpOutputs::kStateCellOut=2*/] });
-}
-
-class Shuffler {
-  std::vector<int> sequence;
- public:
-  explicit Shuffler(int size) : sequence(size) {
-    int* p = sequence.data();
-    for (int i = 0; i < size; i++)
-      *p++ = i;
-  }
-  void shuffle(std::function<void(int, int)> lambda = nullptr) {
-    random_shuffle(sequence.begin(), sequence.end());
-    int n = 0;
-    if (lambda != nullptr)
-      for (int i : sequence)
-        lambda(n++, i);
-  }
-  const int* data() {
-    return sequence.data();
-  }
-};
-
-class BucketSentenceIter : public DataIter {
-  Shuffler* random;
-  int batch, current, end;
-  unsigned int sequence_length;
-  Context device;
-  std::vector<std::vector<mx_float>> sequences;
-  std::vector<wchar_t> index2chars;
-  std::unordered_map<wchar_t, mx_float> charIndices;
-
- public:
-  BucketSentenceIter(std::string filename, int minibatch, Context context) : batch(minibatch),
-  current(-1), device(context) {
-    auto content = readContent(filename);
-    buildCharIndex(content);
-    sequences = convertTextToSequences(content, '\n');
-
-    int N = sequences.size() / batch * batch;  // total used samples
-    sequences.resize(N);
-    sort(sequences.begin(), sequences.end(), [](const std::vector<mx_float>& a,
-        const std::vector<mx_float>& b) { return a.size() < b.size(); });
-
-    sequence_length = sequences.back().size();
-    random = new Shuffler(N);
-    // We still can get random results if call Reset() firstly
-//    std::vector<vector<mx_float>>* target = &sequences;
-//    random->shuffle([target](int n, int i) { (*target)[n].swap((*target)[i]); });
-    end = N / batch;
-  }
-  virtual ~BucketSentenceIter() {
-    delete random;
-  }
-
-  unsigned int maxSequenceLength() {
-    return sequence_length;
-  }
-
-  size_t characterSize() {
-    return charIndices.size();
-  }
-
-  virtual bool Next(void) {
-    return ++current < end;
-  }
-  virtual NDArray GetData(void) {
-    const int* indices = random->data();
-    mx_float *data = new mx_float[sequence_length * batch], *pdata = data;
-
-    for (int i = current * batch, end = i + batch; i < end; i++) {
-      memcpy(pdata, sequences[indices[i]].data(), sequences[indices[i]].size() * sizeof(mx_float));
-      if (sequences[indices[i]].size() < sequence_length)
-        memset(pdata + sequences[indices[i]].size(), 0,
-            (sequence_length - sequences[indices[i]].size()) * sizeof(mx_float));
-      pdata += sequence_length;
-    }
-    NDArray array(Shape(batch, sequence_length), device, false);
-    array.SyncCopyFromCPU(data, batch * sequence_length);
-    return array;
-  }
-  virtual NDArray GetLabel(void) {
-    const int* indices = random->data();
-    mx_float *label = new mx_float[sequence_length * batch], *plabel = label;
-
-    for (int i = current * batch, end = i + batch; i < end; i++) {
-      memcpy(plabel, sequences[indices[i]].data() + 1,
-          (sequences[indices[i]].size() - 1) * sizeof(mx_float));
-      memset(plabel + sequences[indices[i]].size() - 1, 0,
-          (sequence_length - sequences[indices[i]].size() + 1) * sizeof(mx_float));
-      plabel += sequence_length;
-    }
-    NDArray array(Shape(batch, sequence_length), device, false);
-    array.SyncCopyFromCPU(label, batch * sequence_length);
-    return array;
-  }
-  virtual int GetPadNum(void) {
-    return sequence_length - sequences[random->data()[current * batch]].size();
-  }
-  virtual std::vector<int> GetIndex(void) {
-    const int* indices = random->data();
-    std::vector<int> list(indices + current * batch, indices + current * batch + batch);
-    return list;
-  }
-  virtual void BeforeFirst(void) {
-    current = -1;
-    random->shuffle(nullptr);
-  }
-
-  std::wstring readContent(const std::string file) {
-    std::wifstream ifs(file, std::ios::binary);
-    if (ifs) {
-      std::wostringstream os;
-      os << ifs.rdbuf();
-      return os.str();
-    }
-    return L"";
-  }
-
-  void buildCharIndex(const std::wstring& content) {
-  // This version buildCharIndex() Compatiable with python version char_rnn dictionary
-    int n = 1;
-    charIndices['\0'] = 0;  // padding character
-    index2chars.push_back(0);  // padding character index
-    for (auto c : content)
-      if (charIndices.find(c) == charIndices.end()) {
-        charIndices[c] = n++;
-        index2chars.push_back(c);
-      }
-  }
-//  void buildCharIndex(wstring& content) {
-//    for (auto c : content)
-//      charIndices[c]++; // char-frequency map; then char-index map
-//    std::vector<tuple<wchar_t, mx_float>> characters;
-//    for (auto& iter : charIndices)
-//      characters.push_back(make_tuple(iter.first, iter.second));
-//    sort(characters.begin(), characters.end(), [](const tuple<wchar_t, mx_float>& a,
-//      const tuple<wchar_t, mx_float>& b) { return get<1>(a) > get<1>(b); });
-//    mx_float index = 1; //0 is left for zero-padding
-//    index2chars.clear();
-//    index2chars.push_back(0); //zero-padding
-//    for (auto& t : characters) {
-//      charIndices[get<0>(t)] = index++;
-//      index2chars.push_back(get<0>(t));
-//    }s
-//  }
-
-  inline wchar_t character(int i) {
-    return index2chars[i];
-  }
-
-  inline mx_float index(wchar_t c) {
-    return charIndices[c];
-  }
-
-  void saveCharIndices(const std::string file) {
-    std::wofstream ofs(file, std::ios::binary);
-    if (ofs) {
-      ofs.write(index2chars.data() + 1, index2chars.size() - 1);
-      ofs.close();
-    }
-  }
-
-  static std::tuple<std::unordered_map<wchar_t, mx_float>, std::vector<wchar_t>> loadCharIndices(
-      const std::string file) {
-    std::wifstream ifs(file, std::ios::binary);
-    std::unordered_map<wchar_t, mx_float> map;
-    std::vector<wchar_t> chars;
-    if (ifs) {
-      std::wostringstream os;
-      os << ifs.rdbuf();
-      int n = 1;
-      map[L'\0'] = 0;
-      chars.push_back(L'\0');
-      for (auto c : os.str()) {
-        map[c] = (mx_float) n++;
-        chars.push_back(c);
-      }
-    }
-    // Note: Can't use {} because this would hit the explicit constructor
-    return std::tuple<std::unordered_map<wchar_t, mx_float>, std::vector<wchar_t>>(map, chars);
-  }
-
-  std::vector<std::vector<mx_float>>
-  convertTextToSequences(const std::wstring& content, wchar_t spliter) {
-    std::vector<std::vector<mx_float>> sequences;
-    sequences.push_back(std::vector<mx_float>());
-    for (auto c : content)
-      if (c == spliter && !sequences.back().empty())
-        sequences.push_back(std::vector<mx_float>());
-      else
-        sequences.back().push_back(charIndices[c]);
-    return sequences;
-  }
-};
-
-void OutputPerplexity(NDArray* labels, NDArray* output) {
-  std::vector<mx_float> charIndices, a;
-  labels->SyncCopyToCPU(&charIndices, 0L);  // 0L indicates all
-  output->SyncCopyToCPU(&a, 0L)/*4128*84*/;
-  mx_float loss = 0;
-  int batchSize = labels->GetShape()[0]/*32*/, sequenceLength = labels->GetShape()[1]/*129*/,
-      nSamples = output->GetShape()[0]/*4128*/, vocabSize = output->GetShape()[1]/*84*/;
-  for (int n = 0; n < nSamples; n++) {
-    int row = n % batchSize, column = n / batchSize, labelOffset = column +
-        row * sequenceLength;  // Search based on column storage: labels.T
-    mx_float safe_value = std::max(1e-10f, a[vocabSize * n +
-                                    static_cast<int>(charIndices[labelOffset])]);
-    loss += -log(safe_value);  // Calculate negative log-likelihood
-  }
-  loss = exp(loss / nSamples);
-  std::cout << "Train-Perplexity=" << loss << std::endl;
-}
-
-void SaveCheckpoint(const std::string filepath, Symbol net, Executor* exe) {
-  std::map<std::string, NDArray> params;
-  for (auto iter : exe->arg_dict())
-    if (iter.first.find("_init_") == std::string::npos
-        && iter.first.rfind("data") != iter.first.length() - 4
-        && iter.first.rfind("label") != iter.first.length() - 5)
-      params.insert({"arg:" + iter.first, iter.second});
-  for (auto iter : exe->aux_dict())
-      params.insert({"aux:" + iter.first, iter.second});
-  NDArray::Save(filepath, params);
-}
-
-void LoadCheckpoint(const std::string filepath, Executor* exe) {
-  std::map<std::string, NDArray> params = NDArray::LoadToMap(filepath);
-  for (auto iter : params) {
-    std::string type = iter.first.substr(0, 4);
-    std::string name = iter.first.substr(4);
-    NDArray target;
-    if (type == "arg:")
-      target = exe->arg_dict()[name];
-    else if (type == "aux:")
-      target = exe->aux_dict()[name];
-    else
-      continue;
-    iter.second.CopyTo(&target);
-  }
-}
-
-int input_dim = 0;/*84*/
-int sequence_length_max = 0;/*129*/
-int num_embed = 256;
-int num_lstm_layer = 3;
-int num_hidden = 512;
-mx_float dropout = 0.2;
-void train(const std::string file, int batch_size, int max_epoch, int start_epoch) {
-  Context device(DeviceType::kGPU, 0);
-  BucketSentenceIter dataIter(file, batch_size, device);
-  std::string prefix = file.substr(0, file.rfind("."));
-  dataIter.saveCharIndices(prefix + ".dictionary");
-
-  input_dim = static_cast<int>(dataIter.characterSize());
-  sequence_length_max = dataIter.maxSequenceLength();
-
-  auto RNN = LSTMUnroll(num_lstm_layer, sequence_length_max, input_dim, num_hidden,
-      num_embed, dropout);
-  std::map<std::string, NDArray> args_map;
-  args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
-  args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
-  for (int i = 0; i < num_lstm_layer; i++) {
-    std::string key = "l" + std::to_string(i) + "_init_";
-    args_map[key + "c"] = NDArray(Shape(batch_size, num_hidden), device, false);
-    args_map[key + "h"] = NDArray(Shape(batch_size, num_hidden), device, false);
-  }
-  std::vector<mx_float> zeros(batch_size * num_hidden, 0);
-  // RNN.SimpleBind(device, args_map, {}, {{"data", kNullOp}});
-  Executor* exe = RNN.SimpleBind(device, args_map);
-
-  if (start_epoch == -1) {
-    Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2.34);
-    for (auto &arg : exe->arg_dict())
-      xavier(arg.first, &arg.second);
-  } else {
-    LoadCheckpoint(prefix + "-" + std::to_string(start_epoch) + ".params", exe);
-  }
-  start_epoch++;
-
-  mx_float learning_rate = 0.0002;
-  mx_float weight_decay = 0.000002;
-  Optimizer* opt = OptimizerRegistry::Find("sgd");
-  opt->SetParam("lr", learning_rate)
-     ->SetParam("wd", weight_decay);
-//  opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size)
-//  ->SetParam("clip_gradient", 10);
-
-  for (int epoch = start_epoch; epoch < max_epoch; ++epoch) {
-    dataIter.Reset();
-    auto tic =  std::chrono::system_clock::now();
-    while (dataIter.Next()) {
-      auto data_batch = dataIter.GetDataBatch();
-      data_batch.data.CopyTo(&exe->arg_dict()["data"]);
-      data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]);
-      for (int l = 0; l < num_lstm_layer; l++) {
-        std::string key = "l" + std::to_string(l) + "_init_";
-        exe->arg_dict()[key + "c"].SyncCopyFromCPU(zeros);
-        exe->arg_dict()[key + "h"].SyncCopyFromCPU(zeros);
-      }
-      NDArray::WaitAll();
-
-      exe->Forward(true);
-      exe->Backward();
-      for (size_t i = 0; i < exe->arg_arrays.size(); ++i) {
-        opt->Update(i, exe->arg_arrays[i], exe->grad_arrays[i]);
-      }
-
-      NDArray::WaitAll();
-    }
-    auto toc =  std::chrono::system_clock::now();
-    std::cout << "Epoch[" << epoch << "] Time Cost:" <<
-         std::chrono::duration_cast< std::chrono::seconds>(toc - tic).count() << " seconds ";
-    OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]);
-    std::string filepath = prefix + "-" + std::to_string(epoch) + ".params";
-    SaveCheckpoint(filepath, RNN, exe);
-  }
-
-  delete exe;
-  delete opt;
-}
-
-/*The original example, rnn_cell_demo.py, uses default Xavier as initalizer, which relies on
- * variable name, cannot initialize LSTM_parameters. Thus it was renamed to LSTM_bias,
- * which can be initialized as zero. But it cannot converge after 100 epochs in this corpus
- * example. Using RNNXavier, after 15 oscillating epochs,  it rapidly converges like old
- * LSTMUnroll version. */
-class RNNXavier : public Xavier {
- public:
-  RNNXavier(RandType rand_type = gaussian, FactorType factor_type = avg,
-    float magnitude = 3) : Xavier(rand_type, factor_type, magnitude) {
-  }
-  virtual ~RNNXavier() {}
- protected:
-  virtual void InitDefault(NDArray* arr) {
-    Xavier::InitWeight(arr);
-  }
-};
-
-void trainWithBuiltInRNNOp(const std::string file, int batch_size, int max_epoch, int start_epoch) {
-  Context device(DeviceType::kGPU, 0);
-  BucketSentenceIter dataIter(file, batch_size, device);
-  std::string prefix = file.substr(0, file.rfind("."));
-  dataIter.saveCharIndices(prefix + ".dictionary");
-
-  input_dim = static_cast<int>(dataIter.characterSize());
-  sequence_length_max = dataIter.maxSequenceLength();
-
-  auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, sequence_length_max, input_dim, num_hidden,
-      num_embed, dropout);
-  std::map<std::string, NDArray> args_map;
-  args_map["data"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
-  // Avoiding SwapAxis, batch_size is of second dimension.
-  args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false);
-  args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, batch_size, num_hidden), device, false);
-  args_map["softmax_label"] = NDArray(Shape(batch_size, sequence_length_max), device, false);
-  std::vector<mx_float> zeros(batch_size * num_lstm_layer * num_hidden, 0);
-  Executor* exe = RNN.SimpleBind(device, args_map);
-
-  if (start_epoch == -1) {
-    RNNXavier xavier = RNNXavier(Xavier::gaussian, Xavier::in, 2.34);
-    for (auto &arg : exe->arg_dict())
-      xavier(arg.first, &arg.second);
-  } else {
-    LoadCheckpoint(prefix + "-" + std::to_string(start_epoch) + ".params", exe);
-  }
-  start_epoch++;
-
-  Optimizer* opt = OptimizerRegistry::Find("sgd");
-//  opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size)
-//  ->SetParam("clip_gradient", 10);
-
-  for (int epoch = start_epoch; epoch < max_epoch; ++epoch) {
-    dataIter.Reset();
-    auto tic =  std::chrono::system_clock::now();
-    while (dataIter.Next()) {
-      auto data_batch = dataIter.GetDataBatch();
-      data_batch.data.CopyTo(&exe->arg_dict()["data"]);
-      data_batch.label.CopyTo(&exe->arg_dict()["softmax_label"]);
-      exe->arg_dict()["LSTM_init_c"].SyncCopyFromCPU(zeros);
-      exe->arg_dict()["LSTM_init_h"].SyncCopyFromCPU(zeros);
-      NDArray::WaitAll();
-
-      exe->Forward(true);
-      exe->Backward();
-      for (size_t i = 0; i < exe->arg_arrays.size(); ++i) {
-        opt->Update(i, exe->arg_arrays[i], exe->grad_arrays[i]);
-      }
-      NDArray::WaitAll();
-    }
-    auto toc =  std::chrono::system_clock::now();
-    std::cout << "Epoch[" << epoch << "] Time Cost:" <<
-         std::chrono::duration_cast< std::chrono::seconds>(toc - tic).count() << " seconds ";
-    OutputPerplexity(&exe->arg_dict()["softmax_label"], &exe->outputs[0]);
-    std::string filepath = prefix + "-" + std::to_string(epoch) + ".params";
-    SaveCheckpoint(filepath, RNN, exe);
-  }
-
-  delete exe;
-  delete opt;
-}
-
-void predict(std::wstring* ptext, int sequence_length, const std::string param_file,
-    const std::string dictionary_file) {
-  Context device(DeviceType::kGPU, 0);
-  auto results = BucketSentenceIter::loadCharIndices(dictionary_file);
-  auto dictionary = std::get<0>(results);
-  auto charIndices = std::get<1>(results);
-  input_dim = static_cast<int>(charIndices.size());
-  auto RNN = LSTMUnroll(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0);
-
-  std::map<std::string, NDArray> args_map;
-  args_map["data"] = NDArray(Shape(1, 1), device, false);
-  args_map["softmax_label"] = NDArray(Shape(1, 1), device, false);
-  std::vector<mx_float> zeros(1 * num_hidden, 0);
-  for (int l = 0; l < num_lstm_layer; l++) {
-    std::string key = "l" + std::to_string(l) + "_init_";
-    args_map[key + "c"] = NDArray(Shape(1, num_hidden), device, false);
-    args_map[key + "h"] = NDArray(Shape(1, num_hidden), device, false);
-    args_map[key + "c"].SyncCopyFromCPU(zeros);
-    args_map[key + "h"].SyncCopyFromCPU(zeros);
-  }
-  Executor* exe = RNN.SimpleBind(device, args_map);
-  LoadCheckpoint(param_file, exe);
-
-  mx_float index;
-  wchar_t next = 0;
-  std::vector<mx_float> softmax;
-  softmax.resize(input_dim);
-  for (auto c : *ptext) {
-    exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1);
-    exe->Forward(false);
-
-    exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim);
-    for (int l = 0; l < num_lstm_layer; l++) {
-      std::string key = "l" + std::to_string(l) + "_init_";
-      exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]);
-      exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]);
-    }
-
-    size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin();
-    index = (mx_float) n;
-    next = charIndices[n];
-  }
-  ptext->push_back(next);
-
-  for (int i = 0; i < sequence_length; i++) {
-    exe->arg_dict()["data"].SyncCopyFromCPU(&index, 1);
-    exe->Forward(false);
-
-    exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim);
-    for (int l = 0; l < num_lstm_layer; l++) {
-      std::string key = "l" + std::to_string(l) + "_init_";
-      exe->outputs[l * 2 + 1].CopyTo(&args_map[key + "c"]);
-      exe->outputs[l * 2 + 2].CopyTo(&args_map[key + "h"]);
-    }
-
-    size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin();
-    index = (mx_float) n;
-    next = charIndices[n];
-    ptext->push_back(next);
-  }
-
-  delete exe;
-}
-
-void predictWithBuiltInRNNOp(std::wstring* ptext, int sequence_length, const std::string param_file,
-  const std::string dictionary_file) {
-  Context device(DeviceType::kGPU, 0);
-  auto results = BucketSentenceIter::loadCharIndices(dictionary_file);
-  auto dictionary = std::get<0>(results);
-  auto charIndices = std::get<1>(results);
-  input_dim = static_cast<int>(charIndices.size());
-  auto RNN = LSTMWithBuiltInRNNOp(num_lstm_layer, 1, input_dim, num_hidden, num_embed, 0);
-
-  std::map<std::string, NDArray> args_map;
-  args_map["data"] = NDArray(Shape(1, 1), device, false);
-  args_map["softmax_label"] = NDArray(Shape(1, 1), device, false);
-  std::vector<mx_float> zeros(1 * num_lstm_layer * num_hidden, 0);
-  // Avoiding SwapAxis, batch_size=1 is of second dimension.
-  args_map["LSTM_init_c"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false);
-  args_map["LSTM_init_h"] = NDArray(Shape(num_lstm_layer, 1, num_hidden), device, false);
-  args_map["LSTM_init_c"].SyncCopyFromCPU(zeros);
-  args_map["LSTM_init_h"].SyncCopyFromCPU(zeros);
-  Executor* exe = RNN.SimpleBind(device, args_map);
-  LoadCheckpoint(param_file, exe);
-
-  mx_float index;
-  wchar_t next = 0;
-  std::vector<mx_float> softmax;
-  softmax.resize(input_dim);
-  for (auto c : *ptext) {
-    exe->arg_dict()["data"].SyncCopyFromCPU(&dictionary[c], 1);
-    exe->Forward(false);
-
-    exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim);
-    exe->outputs[1].CopyTo(&args_map["LSTM_init_h"]);
-    exe->outputs[2].CopyTo(&args_map["LSTM_init_c"]);
-
-    size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin();
-    index = (mx_float) n;
-    next = charIndices[n];
-  }
-  ptext->push_back(next);
-
-  for (int i = 0; i < sequence_length; i++) {
-    exe->arg_dict()["data"].SyncCopyFromCPU(&index, 1);
-    exe->Forward(false);
-
-    exe->outputs[0].SyncCopyToCPU(softmax.data(), input_dim);
-    exe->outputs[1].CopyTo(&args_map["LSTM_init_h"]);
-    exe->outputs[2].CopyTo(&args_map["LSTM_init_c"]);
-
-    size_t n = max_element(softmax.begin(), softmax.end()) - softmax.begin();
-    index = (mx_float) n;
-    next = charIndices[n];
-    ptext->push_back(next);
-  }
-
-  delete exe;
-}
-
-int main(int argc, char** argv) {
-  if (argc < 5) {
-    std::cout << "Usage for training: charRNN train[BuiltIn][TimeMajor] {corpus file}"
-            " {batch size} {max epoch} [{starting epoch}]" << std::endl;
-    std::cout <<"Usage for prediction: charRNN predict[BuiltIn][TimeMajor] {params file}"
-            " {dictionary file} {beginning of text}" << std::endl;
-    std::cout <<"Note: The {params file} of train/trainBuiltIn/trainTimeMajor/trainBuiltInTimeMajor"
-            " are not compatible with each other." << std::endl;
-    return 0;
-  }
-
-  std::string task = argv[1];
-  bool builtIn = task.find("BuiltIn") != std::string::npos;
-  TIME_MAJOR = task.find("TimeMajor") != std::string::npos;
-  std::cout << "use BuiltIn cuDNN RNN: " << builtIn << std::endl
-         << "use data as TimeMajor: " << TIME_MAJOR << std::endl;
-  TRY
-  if (task.find("train") == 0) {
-    std::cout << "train batch size:      " << argv[3] << std::endl
-           << "train max epoch:       " << argv[4] << std::endl;
-    int start_epoch = argc > 5? atoi(argv[5]) : -1;
-    // this function will generate dictionary file and params file.
-    if (builtIn)
-      trainWithBuiltInRNNOp(argv[2], atoi(argv[3]), atoi(argv[4]), start_epoch);
-    else
-      train(argv[2], atoi(argv[3]), atoi(argv[4]), start_epoch);  // ditto
-  } else if (task.find("predict") == 0) {
-    std::wstring text;  // = L"If there is anyone out there who still doubts ";
-    // Considering of extending to Chinese samples in future, use wchar_t instead of char
-    for (char c : std::string(argv[4]))
-      text.push_back((wchar_t) c);
-    /*Python version predicts text default to random selecltions. Here I didn't write the random
-    code, always choose the 'best' character. So the text length reduced to 600. Longer size often
-    leads to repeated sentances, since training sequence length is only 129 for obama corpus.*/
-    if (builtIn)
-      predictWithBuiltInRNNOp(&text, 600, argv[2], argv[3]);
-    else
-      predict(&text, 600, argv[2], argv[3]);
-    std::wcout << text << std::endl;
-  }
-
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/googlenet.cpp b/cpp-package/example/googlenet.cpp
deleted file mode 100644
index 7b51f4fde3a7..000000000000
--- a/cpp-package/example/googlenet.cpp
+++ /dev/null
@@ -1,198 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- */
-#include <string>
-#include <vector>
-#include <map>
-#include <fstream>
-#include "utils.h"
-#include "mxnet-cpp/MxNetCpp.h"
-
-using namespace mxnet::cpp;
-
-Symbol ConvFactory(Symbol data, int num_filter,
-                   Shape kernel,
-                   Shape stride = Shape(1, 1),
-                   Shape pad = Shape(0, 0),
-                   const std::string & name = "",
-                   const std::string & suffix = "") {
-  Symbol conv_w("conv_" + name + suffix + "_w"), conv_b("conv_" + name + suffix + "_b");
-
-  Symbol conv = Convolution("conv_" + name + suffix, data,
-                            conv_w, conv_b, kernel,
-                            num_filter, stride, Shape(1, 1), pad);
-  return Activation("relu_" + name + suffix, conv, "relu");
-}
-
-Symbol InceptionFactory(Symbol data, int num_1x1, int num_3x3red,
-                        int num_3x3, int num_d5x5red, int num_d5x5,
-                        PoolingPoolType pool, int proj, const std::string & name) {
-  Symbol c1x1 = ConvFactory(data, num_1x1, Shape(1, 1),
-                            Shape(1, 1), Shape(0, 0), name + "_1x1");
-
-  Symbol c3x3r = ConvFactory(data, num_3x3red, Shape(1, 1),
-                             Shape(1, 1), Shape(0, 0), name + "_3x3", "_reduce");
-
-  Symbol c3x3 = ConvFactory(c3x3r, num_3x3, Shape(3, 3),
-                            Shape(1, 1), Shape(1, 1), name + "_3x3");
-
-  Symbol cd5x5r = ConvFactory(data, num_d5x5red, Shape(1, 1),
-                              Shape(1, 1), Shape(0, 0), name + "_5x5", "_reduce");
-
-  Symbol cd5x5 = ConvFactory(cd5x5r, num_d5x5, Shape(5, 5),
-                             Shape(1, 1), Shape(2, 2), name + "_5x5");
-
-  Symbol pooling = Pooling(name + "_pool", data, Shape(3, 3), pool,
-                           false, false, PoolingPoolingConvention::kValid,
-                           Shape(1, 1), Shape(1, 1));
-
-  Symbol cproj = ConvFactory(pooling, proj, Shape(1, 1),
-                             Shape(1, 1), Shape(0, 0), name + "_proj");
-
-  std::vector<Symbol> lst;
-  lst.push_back(c1x1);
-  lst.push_back(c3x3);
-  lst.push_back(cd5x5);
-  lst.push_back(cproj);
-  return Concat("ch_concat_" + name + "_chconcat", lst, lst.size());
-}
-
-Symbol GoogleNetSymbol(int num_classes) {
-  // data and label
-  Symbol data = Symbol::Variable("data");
-  Symbol data_label = Symbol::Variable("data_label");
-
-  Symbol conv1 = ConvFactory(data, 64, Shape(7, 7), Shape(2, 2), Shape(3, 3), "conv1");
-  Symbol pool1 = Pooling("pool1", conv1, Shape(3, 3), PoolingPoolType::kMax,
-                         false, false, PoolingPoolingConvention::kValid, Shape(2, 2));
-  Symbol conv2 = ConvFactory(pool1, 64, Shape(1, 1), Shape(1, 1),
-                             Shape(0, 0), "conv2");
-  Symbol conv3 = ConvFactory(conv2, 192, Shape(3, 3), Shape(1, 1), Shape(1, 1), "conv3");
-  Symbol pool3 = Pooling("pool3", conv3, Shape(3, 3), PoolingPoolType::kMax,
-                         false, false, PoolingPoolingConvention::kValid, Shape(2, 2));
-
-  Symbol in3a = InceptionFactory(pool3, 64, 96, 128, 16, 32, PoolingPoolType::kMax, 32, "in3a");
-  Symbol in3b = InceptionFactory(in3a, 128, 128, 192, 32, 96, PoolingPoolType::kMax, 64, "in3b");
-  Symbol pool4 = Pooling("pool4", in3b, Shape(3, 3), PoolingPoolType::kMax,
-                         false, false, PoolingPoolingConvention::kValid, Shape(2, 2));
-  Symbol in4a = InceptionFactory(pool4, 192, 96, 208, 16, 48, PoolingPoolType::kMax, 64, "in4a");
-  Symbol in4b = InceptionFactory(in4a, 160, 112, 224, 24, 64, PoolingPoolType::kMax, 64, "in4b");
-  Symbol in4c = InceptionFactory(in4b, 128, 128, 256, 24, 64, PoolingPoolType::kMax, 64, "in4c");
-  Symbol in4d = InceptionFactory(in4c, 112, 144, 288, 32, 64, PoolingPoolType::kMax, 64, "in4d");
-  Symbol in4e = InceptionFactory(in4d, 256, 160, 320, 32, 128, PoolingPoolType::kMax, 128, "in4e");
-  Symbol pool5 = Pooling("pool5", in4e, Shape(3, 3), PoolingPoolType::kMax,
-                         false, false, PoolingPoolingConvention::kValid, Shape(2, 2));
-  Symbol in5a = InceptionFactory(pool5, 256, 160, 320, 32, 128, PoolingPoolType::kMax, 128, "in5a");
-  Symbol in5b = InceptionFactory(in5a, 384, 192, 384, 48, 128, PoolingPoolType::kMax, 128, "in5b");
-  Symbol pool6 = Pooling("pool6", in5b, Shape(7, 7), PoolingPoolType::kAvg,
-                         false, false, PoolingPoolingConvention::kValid, Shape(1, 1));
-
-  Symbol flatten = Flatten("flatten", pool6);
-
-  Symbol fc1_w("fc1_w"), fc1_b("fc1_b");
-  Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, num_classes);
-
-  return SoftmaxOutput("softmax", fc1, data_label);
-}
-
-int main(int argc, char const *argv[]) {
-  int batch_size = 50;
-  int max_epoch = argc > 1 ? strtol(argv[1], nullptr, 10) : 100;
-  float learning_rate = 1e-4;
-  float weight_decay = 1e-4;
-
-  auto ctx = Context::gpu();
-#if MXNET_USE_CPU
-  ctx = Context::cpu();;
-#endif
-
-  TRY
-  auto googlenet = GoogleNetSymbol(10);
-  std::map<std::string, NDArray> args_map;
-  std::map<std::string, NDArray> aux_map;
-
-  args_map["data"] = NDArray(Shape(batch_size, 3, 256, 256), ctx);
-  args_map["data_label"] = NDArray(Shape(batch_size), ctx);
-  googlenet.InferArgsMap(ctx, &args_map, args_map);
-
-  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
-                                          "./data/mnist_data/train-labels-idx1-ubyte",
-                                          "./data/mnist_data/t10k-images-idx3-ubyte",
-                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
-                                        };
-
-  auto train_iter =  MXDataIter("MNISTIter");
-  if (!setDataIter(&train_iter, "Train", data_files, batch_size)) {
-    return 1;
-  }
-
-  auto val_iter = MXDataIter("MNISTIter");
-  if (!setDataIter(&val_iter, "Label", data_files, batch_size)) {
-    return 1;
-  }
-
-  Optimizer* opt = OptimizerRegistry::Find("sgd");
-  opt->SetParam("momentum", 0.9)
-     ->SetParam("rescale_grad", 1.0 / batch_size)
-     ->SetParam("clip_gradient", 10)
-     ->SetParam("lr", learning_rate)
-     ->SetParam("wd", weight_decay);
-
-
-  auto *exec = googlenet.SimpleBind(ctx, args_map);
-  auto arg_names = googlenet.ListArguments();
-
-  for (int iter = 0; iter < max_epoch; ++iter) {
-    LG << "Epoch: " << iter;
-    train_iter.Reset();
-    while (train_iter.Next()) {
-      auto data_batch = train_iter.GetDataBatch();
-      data_batch.data.CopyTo(&args_map["data"]);
-      data_batch.label.CopyTo(&args_map["data_label"]);
-      NDArray::WaitAll();
-      exec->Forward(true);
-      exec->Backward();
-      for (size_t i = 0; i < arg_names.size(); ++i) {
-        if (arg_names[i] == "data" || arg_names[i] == "data_label") continue;
-        opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
-      }
-    }
-
-    Accuracy acu;
-    val_iter.Reset();
-    while (val_iter.Next()) {
-      auto data_batch = val_iter.GetDataBatch();
-      data_batch.data.CopyTo(&args_map["data"]);
-      data_batch.label.CopyTo(&args_map["data_label"]);
-      NDArray::WaitAll();
-      exec->Forward(false);
-      NDArray::WaitAll();
-      acu.Update(data_batch.label, exec->outputs[0]);
-    }
-    LG << "Accuracy: " << acu.Get();
-  }
-
-  delete exec;
-  delete opt;
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/inception_bn.cpp b/cpp-package/example/inception_bn.cpp
deleted file mode 100644
index 8fe6b070497c..000000000000
--- a/cpp-package/example/inception_bn.cpp
+++ /dev/null
@@ -1,261 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- */
-#include <map>
-#include <string>
-#include <fstream>
-#include <vector>
-#include "utils.h"
-#include "mxnet-cpp/MxNetCpp.h"
-
-using namespace mxnet::cpp;
-
-/*!
- * \brief Build a Convolution -> BatchNorm -> ReLU unit.
- *
- * Every layer and parameter name is derived from `name + suffix`.
- *
- * \param data       input symbol
- * \param num_filter number of convolution output channels
- * \param kernel     convolution kernel shape
- * \param stride     convolution stride
- * \param pad        convolution padding
- * \param name       base name of the unit
- * \param suffix     optional string appended to `name`
- * \return the ReLU activation symbol that terminates the unit
- */
-Symbol ConvFactoryBN(Symbol data, int num_filter,
-                     Shape kernel, Shape stride, Shape pad,
-                     const std::string & name,
-                     const std::string & suffix = "") {
-  const std::string unit = name + suffix;
-
-  // Convolution weight and bias variables.
-  Symbol conv_w("conv_" + unit + "_w");
-  Symbol conv_b("conv_" + unit + "_b");
-
-  // The convolution itself; the dilation argument is fixed to 1x1.
-  const Shape dilate(1, 1);
-  Symbol conv = Convolution("conv_" + unit, data, conv_w, conv_b,
-                            kernel, num_filter, stride, dilate, pad);
-
-  // The four variables handed to BatchNorm.
-  Symbol gamma(unit + "_gamma");
-  Symbol beta(unit + "_beta");
-  Symbol mmean(unit + "_mmean");
-  Symbol mvar(unit + "_mvar");
-  Symbol bn = BatchNorm("bn_" + unit, conv, gamma, beta, mmean, mvar);
-
-  return Activation("relu_" + unit, bn, "relu");
-}
-
-/*!
- * \brief Build an Inception unit that concatenates four branches of `data`.
- *
- * Branches, in concatenation order:
- *   1. 1x1 convolution
- *   2. 1x1 reduce -> 3x3 convolution
- *   3. 1x1 reduce -> 3x3 -> 3x3 convolution ("double 3x3")
- *   4. 3x3 pooling of `data` -> 1x1 projection
- *
- * \param data        input symbol of the unit
- * \param num_1x1     filters of the 1x1 branch
- * \param num_3x3red  filters of the 1x1 reduction in front of the 3x3 branch
- * \param num_3x3     filters of the 3x3 branch
- * \param num_d3x3red filters of the 1x1 reduction in front of the double-3x3 branch
- * \param num_d3x3    filters of each 3x3 convolution in the double-3x3 branch
- * \param pool        pooling type used by the pooling branch
- * \param proj        filters of the 1x1 projection after pooling
- * \param name        name fragment used for every layer of the unit
- * \return the concatenation of the four branches
- */
-Symbol InceptionFactoryA(Symbol data, int num_1x1, int num_3x3red,
-                         int num_3x3, int num_d3x3red, int num_d3x3,
-                         PoolingPoolType pool, int proj,
-                         const std::string & name) {
-  // Branch 1: 1x1 convolution.
-  Symbol c1x1 = ConvFactoryBN(data, num_1x1, Shape(1, 1), Shape(1, 1),
-                              Shape(0, 0), name + "1x1");
-
-  // Branch 2: 1x1 reduction followed by a 3x3 convolution.
-  Symbol c3x3r = ConvFactoryBN(data, num_3x3red, Shape(1, 1), Shape(1, 1),
-                               Shape(0, 0), name + "_3x3r");
-  Symbol c3x3 = ConvFactoryBN(c3x3r, num_3x3, Shape(3, 3), Shape(1, 1),
-                              Shape(1, 1), name + "_3x3");
-
-  // Branch 3: 1x1 reduction followed by two stacked 3x3 convolutions.
-  Symbol cd3x3r = ConvFactoryBN(data, num_d3x3red, Shape(1, 1), Shape(1, 1),
-                                Shape(0, 0), name + "_double_3x3", "_reduce");
-  Symbol cd3x3 = ConvFactoryBN(cd3x3r, num_d3x3, Shape(3, 3), Shape(1, 1),
-                               Shape(1, 1), name + "_double_3x3_0");
-  // The previous code passed `data = cd3x3` here. That assignment overwrote
-  // the `data` parameter, so the pooling branch below was built on top of
-  // this branch instead of on the unit's input. Pass the symbol directly.
-  cd3x3 = ConvFactoryBN(cd3x3, num_d3x3, Shape(3, 3), Shape(1, 1),
-                        Shape(1, 1), name + "_double_3x3_1");
-
-  // Branch 4: 3x3 pooling of the unit's input followed by a 1x1 projection.
-  Symbol pooling = Pooling(name + "_pool", data,
-                           Shape(3, 3), pool, false, false,
-                           PoolingPoolingConvention::kValid,
-                           Shape(1, 1), Shape(1, 1));
-  Symbol cproj = ConvFactoryBN(pooling, proj, Shape(1, 1), Shape(1, 1),
-                               Shape(0, 0), name + "_proj");
-
-  std::vector<Symbol> lst;
-  lst.push_back(c1x1);
-  lst.push_back(c3x3);
-  lst.push_back(cd3x3);
-  lst.push_back(cproj);
-  return Concat("ch_concat_" + name + "_chconcat", lst, lst.size());
-}
-
-/*!
- * \brief Build an Inception unit whose 3x3 branches and pooling use stride 2x2.
- *
- * Branches, in concatenation order:
- *   1. 1x1 reduce -> 3x3 convolution with stride 2x2
- *   2. 1x1 reduce -> 3x3 -> 3x3 convolution with stride 2x2
- *   3. 3x3 max pooling of `data`
- *
- * \param data        input symbol of the unit
- * \param num_3x3red  filters of the 1x1 reduction in front of the 3x3 branch
- * \param num_3x3     filters of the 3x3 branch
- * \param num_d3x3red filters of the 1x1 reduction in front of the double-3x3 branch
- * \param num_d3x3    filters of each 3x3 convolution in the double-3x3 branch
- * \param name        name fragment used for every layer of the unit
- * \return the concatenation of the three branches
- */
-Symbol InceptionFactoryB(Symbol data, int num_3x3red, int num_3x3,
-                         int num_d3x3red, int num_d3x3, const std::string & name) {
-  const std::string single_name = name + "_3x3";
-  const std::string double_name = name + "_double_3x3";
-
-  // Single 3x3 branch.
-  Symbol reduce_3x3 = ConvFactoryBN(data, num_3x3red, Shape(1, 1),
-                                    Shape(1, 1), Shape(0, 0),
-                                    single_name, "_reduce");
-  Symbol branch_3x3 = ConvFactoryBN(reduce_3x3, num_3x3, Shape(3, 3),
-                                    Shape(2, 2), Shape(1, 1), single_name);
-
-  // Double 3x3 branch; only the second convolution uses the 2x2 stride.
-  Symbol reduce_d3x3 = ConvFactoryBN(data, num_d3x3red, Shape(1, 1),
-                                     Shape(1, 1), Shape(0, 0),
-                                     double_name, "_reduce");
-  Symbol branch_d3x3 = ConvFactoryBN(reduce_d3x3, num_d3x3, Shape(3, 3),
-                                     Shape(1, 1), Shape(1, 1),
-                                     double_name + "_0");
-  branch_d3x3 = ConvFactoryBN(branch_d3x3, num_d3x3, Shape(3, 3),
-                              Shape(2, 2), Shape(1, 1),
-                              double_name + "_1");
-
-  // Max-pooling branch applied to the unit's input.
-  Symbol branch_pool = Pooling("max_pool_" + name + "_pool", data,
-                               Shape(3, 3), PoolingPoolType::kMax,
-                               false, false, PoolingPoolingConvention::kValid,
-                               Shape(2, 2), Shape(1, 1));
-
-  std::vector<Symbol> branches = {branch_3x3, branch_d3x3, branch_pool};
-  return Concat("ch_concat_" + name + "_chconcat", branches, branches.size());
-}
-
-/*!
- * \brief Assemble the Inception-BN network used by this example.
- *
- * \param num_classes number of hidden units of the final fully connected layer
- * \return the SoftmaxOutput symbol; its inputs are the variables "data" and
- *         "data_label"
- */
-Symbol InceptionSymbol(int num_classes) {
-  // Input and label variables.
-  Symbol data = Symbol::Variable("data");
-  Symbol data_label = Symbol::Variable("data_label");
-
-  // 3x3 max pooling shared by stage 1 and stage 2.
-  auto max_pool = [](const std::string &pool_name, Symbol input) {
-    return Pooling(pool_name, input, Shape(3, 3), PoolingPoolType::kMax,
-                   false, false, PoolingPoolingConvention::kValid, Shape(2, 2));
-  };
-  const PoolingPoolType kAvg = PoolingPoolType::kAvg;
-  const PoolingPoolType kMax = PoolingPoolType::kMax;
-
-  // Stage 1.
-  Symbol s1_conv = ConvFactoryBN(data, 64, Shape(7, 7), Shape(2, 2),
-                                 Shape(3, 3), "conv1");
-  Symbol s1_pool = max_pool("pool1", s1_conv);
-
-  // Stage 2.
-  Symbol s2_reduce = ConvFactoryBN(s1_pool, 64, Shape(1, 1), Shape(1, 1),
-                                   Shape(0, 0), "conv2red");
-  Symbol s2_conv = ConvFactoryBN(s2_reduce, 192, Shape(3, 3), Shape(1, 1),
-                                 Shape(1, 1), "conv2");
-  Symbol s2_pool = max_pool("pool2", s2_conv);
-
-  // Stage 3.
-  Symbol inc3a = InceptionFactoryA(s2_pool, 64, 64, 64, 64, 96, kAvg, 32, "3a");
-  Symbol inc3b = InceptionFactoryA(inc3a, 64, 64, 96, 64, 96, kAvg, 64, "3b");
-  Symbol inc3c = InceptionFactoryB(inc3b, 128, 160, 64, 96, "3c");
-
-  // Stage 4.
-  Symbol inc4a = InceptionFactoryA(inc3c, 224, 64, 96, 96, 128, kAvg, 128, "4a");
-  Symbol inc4b = InceptionFactoryA(inc4a, 192, 96, 128, 96, 128, kAvg, 128, "4b");
-  Symbol inc4c = InceptionFactoryA(inc4b, 160, 128, 160, 128, 160, kAvg, 128, "4c");
-  Symbol inc4d = InceptionFactoryA(inc4c, 96, 128, 192, 160, 192, kAvg, 128, "4d");
-  Symbol inc4e = InceptionFactoryB(inc4d, 128, 192, 192, 256, "4e");
-
-  // Stage 5.
-  Symbol inc5a = InceptionFactoryA(inc4e, 352, 192, 320, 160, 224, kAvg, 128, "5a");
-  Symbol inc5b = InceptionFactoryA(inc5a, 352, 192, 320, 192, 224, kMax, 128, "5b");
-
-  // 7x7 average pooling in front of the classifier.
-  Symbol pooled = Pooling("global_pool", inc5b, Shape(7, 7), kAvg);
-
-  // Classifier. The fully connected parameters are named "conv1_w"/"conv1_b".
-  Symbol flat = Flatten("flatten", pooled);
-  Symbol fc_weight("conv1_w");
-  Symbol fc_bias("conv1_b");
-  Symbol fc1 = FullyConnected("fc1", flat, fc_weight, fc_bias, num_classes);
-  return SoftmaxOutput("softmax", fc1, data_label);
-}
-
-/*!
- * \brief Turn a batch of 28x28 single-channel images into network input.
- *
- * The batch is reshaped to (?, 1, 28, 28), resized with
- * `_contrib_BilinearResize2D` to new_shape[2] x new_shape[3], and then tiled
- * with reps (1, 3, 1, 1).
- *
- * \param data      batch to convert
- * \param new_shape target shape; only elements 2 (height) and 3 (width) are read
- * \return the resized and tiled batch
- */
-NDArray ResizeInput(NDArray data, const Shape new_shape) {
-  NDArray single_channel = data.Reshape(Shape(0, 1, 28, 28));
-
-  // Bilinear resize to the requested height and width.
-  Operator resize_op("_contrib_BilinearResize2D");
-  resize_op.SetParam("height", new_shape[2]);
-  resize_op.SetParam("width", new_shape[3]);
-  NDArray resized;
-  resize_op(single_channel).Invoke(resized);
-
-  // Repeat the second axis three times.
-  Operator tile_op("tile");
-  tile_op.SetParam("reps", Shape(1, 3, 1, 1));
-  NDArray tiled;
-  tile_op(resized).Invoke(tiled);
-
-  return tiled;
-}
-
-/*!
- * \brief Train Inception-BN on MNIST and report accuracy after each epoch.
- *
- * \param argc number of command-line arguments
- * \param argv argv[1], when present, is the number of epochs (default 100)
- * \return 0 on success, 1 when a data iterator cannot be set up
- */
-int main(int argc, char const *argv[]) {
-  int batch_size = 40;
-  int max_epoch = argc > 1 ? strtol(argv[1], nullptr, 10) : 100;
-  float learning_rate = 1e-2;
-  float weight_decay = 1e-4;
-
-  /*context*/
-  auto ctx = Context::cpu();
-  // Initialised so that a failing MXGetGPUCount cannot leave it indeterminate.
-  int num_gpu = 0;
-  MXGetGPUCount(&num_gpu);
-#if !MXNET_USE_CPU
-  if (num_gpu > 0) {
-    ctx = Context::gpu();
-  }
-#endif
-
-  TRY
-  auto inception_bn_net = InceptionSymbol(10);
-  std::map<std::string, NDArray> args_map;
-
-  // Allocate the input arrays, then let the symbol infer the remaining ones.
-  const Shape data_shape = Shape(batch_size, 3, 224, 224),
-              label_shape = Shape(batch_size);
-  args_map["data"] = NDArray(data_shape, ctx);
-  args_map["data_label"] = NDArray(label_shape, ctx);
-  inception_bn_net.InferArgsMap(ctx, &args_map, args_map);
-
-  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
-                                          "./data/mnist_data/train-labels-idx1-ubyte",
-                                          "./data/mnist_data/t10k-images-idx3-ubyte",
-                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
-                                        };
-
-  auto train_iter =  MXDataIter("MNISTIter");
-  if (!setDataIter(&train_iter, "Train", data_files, batch_size)) {
-    return 1;
-  }
-
-  auto val_iter = MXDataIter("MNISTIter");
-  if (!setDataIter(&val_iter, "Label", data_files, batch_size)) {
-    return 1;
-  }
-
-  // initialize parameters
-  Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2);
-  for (auto &arg : args_map) {
-    xavier(arg.first, &arg.second);
-  }
-
-  Optimizer* opt = OptimizerRegistry::Find("sgd");
-  opt->SetParam("momentum", 0.9)
-     ->SetParam("rescale_grad", 1.0 / batch_size)
-     ->SetParam("clip_gradient", 10)
-     ->SetParam("lr", learning_rate)
-     ->SetParam("wd", weight_decay);
-
-  auto *exec = inception_bn_net.SimpleBind(ctx, args_map);
-  auto arg_names = inception_bn_net.ListArguments();
-
-  // Create metrics
-  Accuracy train_acc, val_acc;
-  for (int iter = 0; iter < max_epoch; ++iter) {
-    LG << "Epoch: " << iter;
-    train_iter.Reset();
-    train_acc.Reset();
-    while (train_iter.Next()) {
-      // Copy the batch into the arrays the executor was bound with.
-      auto data_batch = train_iter.GetDataBatch();
-      ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]);
-      data_batch.label.CopyTo(&args_map["data_label"]);
-      NDArray::WaitAll();
-
-      exec->Forward(true);
-      exec->Backward();
-      // Update parameters; the two input arrays are not trainable.
-      for (size_t i = 0; i < arg_names.size(); ++i) {
-        if (arg_names[i] == "data" || arg_names[i] == "data_label") continue;
-        opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
-      }
-
-      NDArray::WaitAll();
-      train_acc.Update(data_batch.label, exec->outputs[0]);
-    }
-
-    val_iter.Reset();
-    val_acc.Reset();
-    while (val_iter.Next()) {
-      auto data_batch = val_iter.GetDataBatch();
-      ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]);
-      data_batch.label.CopyTo(&args_map["data_label"]);
-      NDArray::WaitAll();
-      // Forward pass only; no gradients are needed for evaluation.
-      exec->Forward(false);
-      NDArray::WaitAll();
-      val_acc.Update(data_batch.label, exec->outputs[0]);
-    }
-    LG << "Train Accuracy: " << train_acc.Get();
-    LG << "Validation Accuracy: " << val_acc.Get();
-  }
-  delete exec;
-  delete opt;
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp
deleted file mode 100644
index 3e34dbb486ab..000000000000
--- a/cpp-package/example/lenet.cpp
+++ /dev/null
@@ -1,267 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- */
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-#include <cstdlib>
-#include "mxnet-cpp/MxNetCpp.h"
-#include "utils.h"
-
-using namespace mxnet::cpp;
-
-/*!
- * \brief LeNet example that reads MNIST from a CSV file and trains with SGD.
- *
- * The whole data set is kept on the CPU context; mini-batches are copied to
- * the device context (CPU when MXNET_USE_CPU is set, GPU 0 otherwise).
- */
-class Lenet {
- public:
-  Lenet()
-      : ctx_cpu(Context(DeviceType::kCPU, 0)),
-#if MXNET_USE_CPU
-        ctx_dev(Context(DeviceType::kCPU, 0))
-#else
-        ctx_dev(Context(DeviceType::kGPU, 0))
-#endif
-        {}
-
-  /*!
-   * \brief Build the network, load the data and train.
-   *
-   * The validation accuracy is logged after every epoch.
-   *
-   * \param max_epoch number of passes over the training split
-   */
-  void Run(int max_epoch) {
-    /*
-     * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
-     * "Gradient-based learning applied to document recognition."
-     * Proceedings of the IEEE (1998)
-     * */
-
-    /*define the symbolic net*/
-    Symbol data = Symbol::Variable("data");
-    Symbol data_label = Symbol::Variable("data_label");
-    Symbol conv1_w("conv1_w"), conv1_b("conv1_b");
-    Symbol conv2_w("conv2_w"), conv2_b("conv2_b");
-    Symbol conv3_w("conv3_w"), conv3_b("conv3_b");
-    Symbol fc1_w("fc1_w"), fc1_b("fc1_b");
-    Symbol fc2_w("fc2_w"), fc2_b("fc2_b");
-
-    Symbol conv1 =
-        Convolution("conv1", data, conv1_w, conv1_b, Shape(5, 5), 20);
-    Symbol tanh1 = Activation("tanh1", conv1, ActivationActType::kTanh);
-    Symbol pool1 = Pooling("pool1", tanh1, Shape(2, 2), PoolingPoolType::kMax,
-      false, false, PoolingPoolingConvention::kValid, Shape(2, 2));
-
-    Symbol conv2 = Convolution("conv2", pool1, conv2_w, conv2_b,
-      Shape(5, 5), 50);
-    Symbol tanh2 = Activation("tanh2", conv2, ActivationActType::kTanh);
-    Symbol pool2 = Pooling("pool2", tanh2, Shape(2, 2), PoolingPoolType::kMax,
-      false, false, PoolingPoolingConvention::kValid, Shape(2, 2));
-
-    Symbol conv3 = Convolution("conv3", pool2, conv3_w, conv3_b,
-      Shape(2, 2), 500);
-    Symbol tanh3 = Activation("tanh3", conv3, ActivationActType::kTanh);
-    Symbol pool3 = Pooling("pool3", tanh3, Shape(2, 2), PoolingPoolType::kMax,
-      false, false, PoolingPoolingConvention::kValid, Shape(1, 1));
-
-    Symbol flatten = Flatten("flatten", pool3);
-    Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, 500);
-    Symbol tanh4 = Activation("tanh4", fc1, ActivationActType::kTanh);
-    Symbol fc2 = FullyConnected("fc2", tanh4, fc2_w, fc2_b, 10);
-
-    Symbol lenet = SoftmaxOutput("softmax", fc2, data_label);
-
-    for (auto s : lenet.ListArguments()) {
-      LG << s;
-    }
-
-    /*setup basic configs*/
-    int val_fold = 1;
-    int W = 28;
-    int H = 28;
-    int batch_size = 42;
-    float learning_rate = 1e-4;
-    float weight_decay = 1e-4;
-
-    /*prepare the data*/
-    std::vector<float> data_vec, label_vec;
-    size_t data_count = GetData(&data_vec, &label_vec);
-
-    // The first (1 - val_fold / 10) of the rows train, the rest validate.
-    size_t train_num = data_count * (1 - val_fold / 10.0);
-    // The batching below computes `train_num - batch_size`, which would wrap
-    // around for an unsigned size_t when the file is missing or too small.
-    if (train_num < static_cast<size_t>(batch_size)) {
-      LG << "Not enough training samples: " << train_num;
-      return;
-    }
-
-    const float *dptr = data_vec.data();
-    const float *lptr = label_vec.data();
-    NDArray data_array = NDArray(Shape(data_count, 1, W, H), ctx_cpu,
-                                 false);  // store in main memory, and copy to
-    // device memory while training
-    NDArray label_array =
-      NDArray(Shape(data_count), ctx_cpu,
-                false);  // it's also ok if just store them all in device memory
-    data_array.SyncCopyFromCPU(dptr, data_count * W * H);
-    label_array.SyncCopyFromCPU(lptr, data_count);
-    data_array.WaitToRead();
-    label_array.WaitToRead();
-
-    train_data = data_array.Slice(0, train_num);
-    train_label = label_array.Slice(0, train_num);
-    val_data = data_array.Slice(train_num, data_count);
-    val_label = label_array.Slice(train_num, data_count);
-
-    LG << "here read fin";
-
-    /*init some of the args*/
-    // map<string, NDArray> args_map;
-    args_map["data"] = data_array.Slice(0, batch_size).Copy(ctx_dev);
-    args_map["data_label"] = label_array.Slice(0, batch_size).Copy(ctx_dev);
-    NDArray::WaitAll();
-
-    LG << "here slice fin";
-    /*
-     * we can also feed in some of the args other than the input all by
-     * ourselves,
-     * fc2-w , fc1-b for example:
-     * */
-    // args_map["fc2_w"] =
-    // NDArray(mshadow::Shape2(500, 4 * 4 * 50), ctx_dev, false);
-    // NDArray::SampleGaussian(0, 1, &args_map["fc2_w"]);
-    // args_map["fc1_b"] = NDArray(mshadow::Shape1(10), ctx_dev, false);
-    // args_map["fc1_b"] = 0;
-
-    lenet.InferArgsMap(ctx_dev, &args_map, args_map);
-    Optimizer* opt = OptimizerRegistry::Find("sgd");
-    opt->SetParam("momentum", 0.9)
-       ->SetParam("rescale_grad", 1.0)
-       ->SetParam("clip_gradient", 10)
-       ->SetParam("lr", learning_rate)
-       ->SetParam("wd", weight_decay);
-
-    Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
-    auto arg_names = lenet.ListArguments();
-
-    // Handles to the input arrays the executor was bound with. Each batch is
-    // copied INTO these arrays. Assigning a fresh array to args_map["data"],
-    // as the code did before, only rebinds the map entry and leaves the
-    // executor reading its original input on every iteration. Local handles
-    // are used because ValAccuracy() replaces the map entries.
-    NDArray batch_data = args_map["data"];
-    NDArray batch_label = args_map["data_label"];
-
-    for (int ITER = 0; ITER < max_epoch; ++ITER) {
-      size_t start_index = 0;
-      while (start_index < train_num) {
-        // Shift the last batch back so that it is always full.
-        if (start_index + batch_size > train_num) {
-          start_index = train_num - batch_size;
-        }
-        train_data.Slice(start_index, start_index + batch_size)
-            .CopyTo(&batch_data);
-        train_label.Slice(start_index, start_index + batch_size)
-            .CopyTo(&batch_label);
-        start_index += batch_size;
-        NDArray::WaitAll();
-
-        exe->Forward(true);
-        exe->Backward();
-        // Update parameters; the two input arrays are not trainable.
-        for (size_t i = 0; i < arg_names.size(); ++i) {
-          if (arg_names[i] == "data" || arg_names[i] == "data_label") continue;
-          opt->Update(i, exe->arg_arrays[i], exe->grad_arrays[i]);
-        }
-      }
-
-      LG << "Iter " << ITER
-         << ", accuracy: " << ValAccuracy(batch_size * 10, lenet);
-    }
-    delete exe;
-    delete opt;
-  }
-
- private:
-  Context ctx_cpu;  // context holding the full data set
-  Context ctx_dev;  // context the network runs on
-  std::map<std::string, NDArray> args_map;  // argument name -> array
-  NDArray train_data;
-  NDArray train_label;
-  NDArray val_data;
-  NDArray val_label;
-
-  /*!
-   * \brief Read ./data/mnist_data/mnist_train.csv.
-   *
-   * The first line is skipped as a header. In every following line the first
-   * value is appended to `label` and the remaining values, divided by 256,
-   * are appended to `data`.
-   *
-   * \param data  receives the pixel values
-   * \param label receives the labels
-   * \return number of rows read (0 when the file cannot be read)
-   */
-  size_t GetData(std::vector<float> *data, std::vector<float> *label) {
-    const char *train_data_path = "./data/mnist_data/mnist_train.csv";
-    std::ifstream inf(train_data_path);
-    std::string line;
-    inf >> line;  // ignore the header
-    size_t _N = 0;
-    while (inf >> line) {
-      // Turn the commas into blanks so the stream extraction can split them.
-      for (auto &c : line) c = (c == ',') ? ' ' : c;
-      std::stringstream ss;
-      ss << line;
-      float _data;
-      ss >> _data;
-      label->push_back(_data);
-      while (ss >> _data) data->push_back(_data / 256.0);
-      _N++;
-    }
-    inf.close();
-    return _N;
-  }
-
-  /*!
-   * \brief Compute the classification accuracy on the validation split.
-   *
-   * A new executor is bound for every batch from the current args_map, which
-   * is why replacing the "data"/"data_label" entries is sufficient here.
-   *
-   * \param batch_size number of samples evaluated per forward pass; reduced
-   *                   to the size of the validation split when larger
-   * \param lenet      network symbol to evaluate
-   * \return fraction of correctly classified samples (0 for an empty split)
-   */
-  float ValAccuracy(int batch_size, Symbol lenet) {
-    size_t val_num = val_data.GetShape()[0];
-    if (val_num == 0) return 0.0f;
-    // Without this clamp `val_num - batch_size` below would wrap around.
-    if (static_cast<size_t>(batch_size) > val_num) {
-      batch_size = static_cast<int>(val_num);
-    }
-
-    size_t correct_count = 0;
-    size_t all_count = 0;
-
-    size_t start_index = 0;
-    while (start_index < val_num) {
-      // Shift the last batch back so that it is always full.
-      if (start_index + batch_size > val_num) {
-        start_index = val_num - batch_size;
-      }
-      args_map["data"] =
-          val_data.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
-      args_map["data_label"] =
-          val_label.Slice(start_index, start_index + batch_size).Copy(ctx_dev);
-      start_index += batch_size;
-      NDArray::WaitAll();
-
-      Executor *exe = lenet.SimpleBind(ctx_dev, args_map);
-      exe->Forward(false);
-
-      const auto &out = exe->outputs;
-      NDArray out_cpu = out[0].Copy(ctx_cpu);
-      NDArray label_cpu =
-          val_label.Slice(start_index - batch_size, start_index).Copy(ctx_cpu);
-
-      NDArray::WaitAll();
-
-      const mx_float *dptr_out = out_cpu.GetData();
-      const mx_float *dptr_label = label_cpu.GetData();
-      for (int i = 0; i < batch_size; ++i) {
-        float label = dptr_label[i];
-        int cat_num = out_cpu.GetShape()[1];
-        // Arg-max over the scores of sample i.
-        float p_label = 0, max_p = dptr_out[i * cat_num];
-        for (int j = 0; j < cat_num; ++j) {
-          float p = dptr_out[i * cat_num + j];
-          if (max_p < p) {
-            p_label = j;
-            max_p = p;
-          }
-        }
-        if (label == p_label) correct_count++;
-      }
-      all_count += batch_size;
-
-      delete exe;
-    }
-    return correct_count * 1.0 / all_count;
-  }
-};
-
-/*!
- * \brief Entry point of the LeNet example.
- *
- * \param argc number of command-line arguments
- * \param argv argv[1], when present, is the number of epochs (default 100000)
- * \return 0 on normal completion
- */
-int main(int argc, char const *argv[]) {
-  TRY
-  Lenet lenet;
-  int max_epoch = 100000;
-  if (argc > 1) {
-    max_epoch = strtol(argv[1], nullptr, 10);
-  }
-  lenet.Run(max_epoch);
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/lenet_with_mxdataiter.cpp b/cpp-package/example/lenet_with_mxdataiter.cpp
deleted file mode 100644
index 6b37693cda59..000000000000
--- a/cpp-package/example/lenet_with_mxdataiter.cpp
+++ /dev/null
@@ -1,203 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- */
-#include <map>
-#include <string>
-#include <vector>
-#include <fstream>
-#include <chrono>
-#include <cstdlib>
-#include "utils.h"
-#include "mxnet-cpp/MxNetCpp.h"
-
-using namespace mxnet::cpp;
-
-/*!
- * \brief Build the LeNet symbol used by this example.
- *
- * Two convolution/tanh/max-pool stages are followed by two fully connected
- * layers and a softmax output.
- *
- * \return the SoftmaxOutput symbol; its inputs are the variables "data" and
- *         "data_label"
- */
-Symbol LenetSymbol() {
-  /*
-   * LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick Haffner.
-   * "Gradient-based learning applied to document recognition."
-   * Proceedings of the IEEE (1998)
-   * */
-
-  /*define the symbolic net*/
-  Symbol data = Symbol::Variable("data");
-  Symbol data_label = Symbol::Variable("data_label");
-  // Only parameters of layers that exist in this network are declared; the
-  // unused conv3_w/conv3_b variables were dropped.
-  Symbol conv1_w("conv1_w"), conv1_b("conv1_b");
-  Symbol conv2_w("conv2_w"), conv2_b("conv2_b");
-  Symbol fc1_w("fc1_w"), fc1_b("fc1_b");
-  Symbol fc2_w("fc2_w"), fc2_b("fc2_b");
-
-  // Stage 1: 5x5 convolution with 20 filters, tanh, 2x2 max pooling.
-  Symbol conv1 = Convolution("conv1", data, conv1_w, conv1_b, Shape(5, 5), 20);
-  Symbol tanh1 = Activation("tanh1", conv1, ActivationActType::kTanh);
-  Symbol pool1 = Pooling("pool1", tanh1, Shape(2, 2), PoolingPoolType::kMax,
-      false, false, PoolingPoolingConvention::kValid, Shape(2, 2));
-
-  // Stage 2: 5x5 convolution with 50 filters, tanh, 2x2 max pooling.
-  Symbol conv2 = Convolution("conv2", pool1, conv2_w, conv2_b, Shape(5, 5), 50);
-  Symbol tanh2 = Activation("tanh2", conv2, ActivationActType::kTanh);
-  Symbol pool2 = Pooling("pool2", tanh2, Shape(2, 2), PoolingPoolType::kMax,
-      false, false, PoolingPoolingConvention::kValid, Shape(2, 2));
-
-  // Classifier: 500 hidden units, tanh, 10 outputs.
-  Symbol flatten = Flatten("flatten", pool2);
-  Symbol fc1 = FullyConnected("fc1", flatten, fc1_w, fc1_b, 500);
-  Symbol tanh3 = Activation("tanh3", fc1, ActivationActType::kTanh);
-  Symbol fc2 = FullyConnected("fc2", tanh3, fc2_w, fc2_b, 10);
-
-  Symbol lenet = SoftmaxOutput("softmax", fc2, data_label);
-
-  return lenet;
-}
-
-/*!
- * \brief Reshape a batch to (?, 1, 28, 28) and resize it bilinearly.
- *
- * \param data      batch to convert
- * \param new_shape target shape; only elements 2 (height) and 3 (width) are read
- * \return the output of `_contrib_BilinearResize2D`
- */
-NDArray ResizeInput(NDArray data, const Shape new_shape) {
-  NDArray images = data.Reshape(Shape(0, 1, 28, 28));
-
-  Operator resize_op("_contrib_BilinearResize2D");
-  resize_op.SetParam("height", new_shape[2]);
-  resize_op.SetParam("width", new_shape[3]);
-
-  NDArray resized;
-  resize_op(images).Invoke(resized);
-  return resized;
-}
-
-/*!
- * \brief Train LeNet on MNIST through MXDataIter and log throughput/accuracy.
- *
- * \param argc number of command-line arguments
- * \param argv argv[1], when present, is the number of epochs (default 100)
- * \return 0 on success, 1 when a data iterator cannot be set up
- */
-int main(int argc, char const *argv[]) {
-  /*setup basic configs*/
-  int W = 28;
-  int H = 28;
-  int batch_size = 128;
-  int max_epoch = argc > 1 ? strtol(argv[1], nullptr, 10) : 100;
-  float learning_rate = 1e-4;
-  float weight_decay = 1e-4;
-
-  auto dev_ctx = Context::cpu();
-  // Initialised so that a failing MXGetGPUCount cannot leave it indeterminate.
-  int num_gpu = 0;
-  MXGetGPUCount(&num_gpu);
-#if !MXNET_USE_CPU
-  if (num_gpu > 0) {
-    dev_ctx = Context::gpu();
-  }
-#endif
-
-  TRY
-  auto lenet = LenetSymbol();
-  std::map<std::string, NDArray> args_map;
-
-  // Allocate the input arrays, then let the symbol infer the remaining ones.
-  const Shape data_shape = Shape(batch_size, 1, H, W),
-              label_shape = Shape(batch_size);
-  args_map["data"] = NDArray(data_shape, dev_ctx);
-  args_map["data_label"] = NDArray(label_shape, dev_ctx);
-  lenet.InferArgsMap(dev_ctx, &args_map, args_map);
-
-  // Replace two of the inferred arguments with hand-initialised arrays.
-  args_map["fc1_w"] = NDArray(Shape(500, 4 * 4 * 50), dev_ctx);
-  NDArray::SampleGaussian(0, 1, &args_map["fc1_w"]);
-  args_map["fc2_b"] = NDArray(Shape(10), dev_ctx);
-  args_map["fc2_b"] = 0;
-
-  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
-                                          "./data/mnist_data/train-labels-idx1-ubyte",
-                                          "./data/mnist_data/t10k-images-idx3-ubyte",
-                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
-                                        };
-
-  auto train_iter =  MXDataIter("MNISTIter");
-  if (!setDataIter(&train_iter, "Train", data_files, batch_size)) {
-    return 1;
-  }
-
-  auto val_iter = MXDataIter("MNISTIter");
-  if (!setDataIter(&val_iter, "Label", data_files, batch_size)) {
-    return 1;
-  }
-
-  Optimizer* opt = OptimizerRegistry::Find("sgd");
-  opt->SetParam("momentum", 0.9)
-     ->SetParam("rescale_grad", 1.0)
-     ->SetParam("clip_gradient", 10)
-     ->SetParam("lr", learning_rate)
-     ->SetParam("wd", weight_decay);
-
-
-  auto *exec = lenet.SimpleBind(dev_ctx, args_map);
-  auto arg_names = lenet.ListArguments();
-
-  // Create metrics
-  Accuracy train_acc, val_acc;
-
-  for (int iter = 0; iter < max_epoch; ++iter) {
-    int samples = 0;
-    train_iter.Reset();
-    train_acc.Reset();
-
-    auto tic = std::chrono::system_clock::now();
-
-    while (train_iter.Next()) {
-      samples += batch_size;
-      auto data_batch = train_iter.GetDataBatch();
-
-      // Copy the batch into the arrays the executor was bound with.
-      ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]);
-      data_batch.label.CopyTo(&args_map["data_label"]);
-      NDArray::WaitAll();
-
-      // Compute gradients
-      exec->Forward(true);
-      exec->Backward();
-
-      // Update parameters; the two input arrays are not trainable.
-      for (size_t i = 0; i < arg_names.size(); ++i) {
-        if (arg_names[i] == "data" || arg_names[i] == "data_label") continue;
-        opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
-      }
-
-      // Update metric
-      train_acc.Update(data_batch.label, exec->outputs[0]);
-    }
-
-    // one epoch of training is finished
-    auto toc = std::chrono::system_clock::now();
-    float duration = std::chrono::duration_cast<std::chrono::milliseconds>
-                     (toc - tic).count() / 1000.0;
-    LG << "Epoch[" << iter << "] " << samples / duration
-       << " samples/sec " << "Train-Accuracy=" << train_acc.Get();
-
-    // Evaluate on the validation iterator. val_acc is the only metric that
-    // is reported, so the former duplicate metric and Reset() were removed.
-    val_iter.Reset();
-    val_acc.Reset();
-    while (val_iter.Next()) {
-      auto data_batch = val_iter.GetDataBatch();
-      ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]);
-      data_batch.label.CopyTo(&args_map["data_label"]);
-      NDArray::WaitAll();
-
-      // Only forward pass is enough as no gradient is needed when evaluating
-      exec->Forward(false);
-      NDArray::WaitAll();
-      val_acc.Update(data_batch.label, exec->outputs[0]);
-    }
-    LG << "Epoch[" << iter << "] Val-Accuracy=" << val_acc.Get();
-  }
-
-  delete exec;
-  delete opt;
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/mlp.cpp b/cpp-package/example/mlp.cpp
deleted file mode 100644
index 970dad74e727..000000000000
--- a/cpp-package/example/mlp.cpp
+++ /dev/null
@@ -1,182 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- */
-
-#include <iostream>
-#include <vector>
-#include <string>
-#include "mxnet-cpp/MxNetCpp.h"
-#include "utils.h"
-
-using namespace mxnet::cpp;
-
-/*
- * In this example we hand-craft a small data set of 10 classes that follows
- * a simple pattern, and train an MLP to recognize that pattern.
- */
-
-/*!
- * \brief Print the fraction of samples whose arg-max prediction equals the target.
- *
- * \param pred        row-major scores, num_samples rows of num_classes values
- * \param target      expected class index of every sample
- * \param num_samples number of samples to evaluate (default 128)
- * \param num_classes number of scores per sample (default 10)
- */
-void OutputAccuracy(mx_float* pred, mx_float* target,
-                    int num_samples = 128, int num_classes = 10) {
-  int right = 0;
-  for (int i = 0; i < num_samples; ++i) {
-    const mx_float* scores = pred + i * num_classes;
-    // Index of the largest score; the first one wins on ties.
-    int best = 0;
-    for (int j = 1; j < num_classes; ++j) {
-      if (scores[j] > scores[best]) best = j;
-    }
-    if (best == target[i]) right++;
-  }
-  // Report 0 instead of dividing by zero when there is nothing to evaluate.
-  const double accuracy =
-      num_samples > 0 ? right / static_cast<double>(num_samples) : 0.0;
-  std::cout << "Accuracy: " << accuracy << std::endl;
-}
-
-/*!
- * \brief Train a two-layer perceptron on hand-made data with plain gradient descent.
- *
- * The accuracy on the training data is printed every 100 epochs.
- *
- * \param max_epoch number of forward/backward passes over the data
- */
-void MLP(int max_epoch) {
-  // Problem dimensions.
-  const int kNumSamples = 128;
-  const int kNumFeatures = 28;
-  const int kNumHidden = 512;
-  const int kNumClasses = 10;
-
-  auto sym_x = Symbol::Variable("X");
-  auto sym_label = Symbol::Variable("label");
-
-  const int nLayers = 2;
-  std::vector<int> layerSizes({kNumHidden, kNumClasses});
-  std::vector<Symbol> weights(nLayers);
-  std::vector<Symbol> biases(nLayers);
-  std::vector<Symbol> outputs(nLayers);
-
-  // Every layer is FullyConnected followed by LeakyReLU.
-  Symbol null_sym;
-  for (int i = 0; i < nLayers; i++) {
-    std::string istr = std::to_string(i);
-    weights[i] = Symbol::Variable(std::string("w") + istr);
-    biases[i] = Symbol::Variable(std::string("b") + istr);
-    Symbol fc = FullyConnected(std::string("fc") + istr,
-      i == 0? sym_x : outputs[i-1],
-      weights[i], biases[i], layerSizes[i]);
-    outputs[i] = LeakyReLU(std::string("act") + istr, fc, null_sym, LeakyReLUActType::kLeaky);
-  }
-  auto sym_out = SoftmaxOutput("softmax", outputs[nLayers - 1], sym_label);
-
-  Context ctx_dev(DeviceType::kCPU, 0);
-
-  NDArray array_x(Shape(kNumSamples, kNumFeatures), ctx_dev, false);
-  NDArray array_y(Shape(kNumSamples), ctx_dev, false);
-
-  // Host-side buffers. std::vector replaces the former raw new[]/delete[]
-  // so that nothing leaks when an exception unwinds out of this function.
-  std::vector<mx_float> host_x(kNumSamples * kNumFeatures);
-  std::vector<mx_float> host_y(kNumSamples);
-
-  // we make the data by hand, in 10 classes, with some pattern
-  for (int i = 0; i < kNumSamples; i++) {
-    for (int j = 0; j < kNumFeatures; j++) {
-      host_x[i * kNumFeatures + j] = i % 10 * 1.0f;
-    }
-    host_y[i] = i % 10;
-  }
-  array_x.SyncCopyFromCPU(host_x.data(), kNumSamples * kNumFeatures);
-  array_x.WaitToRead();
-  array_y.SyncCopyFromCPU(host_y.data(), kNumSamples);
-  array_y.WaitToRead();
-
-  // init the parameters
-  NDArray array_w_1(Shape(kNumHidden, kNumFeatures), ctx_dev, false);
-  NDArray array_b_1(Shape(kNumHidden), ctx_dev, false);
-  NDArray array_w_2(Shape(kNumClasses, kNumHidden), ctx_dev, false);
-  NDArray array_b_2(Shape(kNumClasses), ctx_dev, false);
-
-  // the parameters should be initialized in some kind of distribution,
-  // so it learns fast
-  // but here just give a const value by hand
-  array_w_1 = 0.5f;
-  array_b_1 = 0.0f;
-  array_w_2 = 0.5f;
-  array_b_2 = 0.0f;
-
-  // the grads
-  NDArray array_w_1_g(Shape(kNumHidden, kNumFeatures), ctx_dev, false);
-  NDArray array_b_1_g(Shape(kNumHidden), ctx_dev, false);
-  NDArray array_w_2_g(Shape(kNumClasses, kNumHidden), ctx_dev, false);
-  NDArray array_b_2_g(Shape(kNumClasses), ctx_dev, false);
-
-  // Bind the symbolic network with the ndarray
-  // all the input args
-  // NOTE(review): the order presumably has to match the symbol's argument
-  // list (X, w0, b0, w1, b1, label) - confirm against ListArguments().
-  std::vector<NDArray> in_args;
-  in_args.push_back(array_x);
-  in_args.push_back(array_w_1);
-  in_args.push_back(array_b_1);
-  in_args.push_back(array_w_2);
-  in_args.push_back(array_b_2);
-  in_args.push_back(array_y);
-  // all the grads
-  std::vector<NDArray> arg_grad_store;
-  arg_grad_store.push_back(NDArray());  // we don't need the grad of the input
-  arg_grad_store.push_back(array_w_1_g);
-  arg_grad_store.push_back(array_b_1_g);
-  arg_grad_store.push_back(array_w_2_g);
-  arg_grad_store.push_back(array_b_2_g);
-  arg_grad_store.push_back(
-      NDArray());  // neither do we need the grad of the loss
-  // how to handle the grad
-  std::vector<OpReqType> grad_req_type;
-  grad_req_type.push_back(kNullOp);
-  grad_req_type.push_back(kWriteTo);
-  grad_req_type.push_back(kWriteTo);
-  grad_req_type.push_back(kWriteTo);
-  grad_req_type.push_back(kWriteTo);
-  grad_req_type.push_back(kNullOp);
-  std::vector<NDArray> aux_states;
-
-  std::cout << "make the Executor" << std::endl;
-  Executor* exe = new Executor(sym_out, ctx_dev, in_args, arg_grad_store,
-                               grad_req_type, aux_states);
-
-  std::cout << "Training" << std::endl;
-  mx_float learning_rate = 0.0001;
-  // Reused host copy of the network output (allocated once, not per report).
-  std::vector<mx_float> host_pred(kNumSamples * kNumClasses);
-  for (int epoch_num = 0; epoch_num < max_epoch; ++epoch_num) {
-    exe->Forward(true);
-    // print accuracy every 100 epoch
-    if (epoch_num % 100 == 0) {
-      std::cout << "epoch " << epoch_num << std::endl;
-      std::vector<NDArray>& out = exe->outputs;
-      out[0].SyncCopyToCPU(host_pred.data(), kNumSamples * kNumClasses);
-      NDArray::WaitAll();
-      OutputAccuracy(host_pred.data(), host_y.data());
-    }
-
-    // update the parameters: plain gradient descent on every argument that
-    // has a gradient (the input and the label are skipped)
-    exe->Backward();
-    for (size_t i = 0; i < in_args.size(); ++i) {
-      if (grad_req_type[i] == kNullOp) continue;
-      in_args[i] -= arg_grad_store[i] * learning_rate;
-    }
-    NDArray::WaitAll();
-  }
-
-  delete exe;
-}
-
-/*!
- * \brief Entry point of the MLP example.
- *
- * \param argc number of command-line arguments
- * \param argv argv[1], when present, is the number of epochs (default 15000)
- * \return 0 on normal completion
- */
-int main(int argc, char** argv) {
-  int max_epoch = 15000;
-  if (argc > 1) {
-    max_epoch = strtol(argv[1], nullptr, 10);
-  }
-  TRY
-  MLP(max_epoch);
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/mlp_cpu.cpp b/cpp-package/example/mlp_cpu.cpp
deleted file mode 100644
index 7ea6946dd8c2..000000000000
--- a/cpp-package/example/mlp_cpu.cpp
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Xin Li yakumolx@gmail.com
- */
-#include <chrono>
-#include "utils.h"
-#include "mxnet-cpp/MxNetCpp.h"
-
-using namespace mxnet::cpp;
-
-Symbol mlp(const std::vector<int> &layers) {
-  auto x = Symbol::Variable("X");
-  auto label = Symbol::Variable("label");
-
-  std::vector<Symbol> weights(layers.size());
-  std::vector<Symbol> biases(layers.size());
-  std::vector<Symbol> outputs(layers.size());
-
-  for (size_t i = 0; i < layers.size(); ++i) {
-    weights[i] = Symbol::Variable("w" + std::to_string(i));
-    biases[i] = Symbol::Variable("b" + std::to_string(i));
-    Symbol fc = FullyConnected(
-      i == 0? x : outputs[i-1],  // data
-      weights[i],
-      biases[i],
-      layers[i]);
-    outputs[i] = i == layers.size()-1 ? fc : Activation(fc, ActivationActType::kRelu);
-  }
-
-  return SoftmaxOutput(outputs.back(), label);
-}
-
-int main(int argc, char** argv) {
-  const int image_size = 28;
-  const std::vector<int> layers{128, 64, 10};
-  const int batch_size = 100;
-  const int max_epoch = 10;
-  const float learning_rate = 0.1;
-  const float weight_decay = 1e-2;
-
-  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
-                                          "./data/mnist_data/train-labels-idx1-ubyte",
-                                          "./data/mnist_data/t10k-images-idx3-ubyte",
-                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
-                                        };
-
-  auto train_iter =  MXDataIter("MNISTIter");
-  if (!setDataIter(&train_iter, "Train", data_files, batch_size)) {
-    return 1;
-  }
-
-  auto val_iter = MXDataIter("MNISTIter");
-  if (!setDataIter(&val_iter, "Label", data_files, batch_size)) {
-    return 1;
-  }
-
-  TRY
-  auto net = mlp(layers);
-
-  Context ctx = Context::cpu();  // Use CPU for training
-
-  std::map<std::string, NDArray> args;
-  args["X"] = NDArray(Shape(batch_size, image_size*image_size), ctx);
-  args["label"] = NDArray(Shape(batch_size), ctx);
-  // Let MXNet infer shapes other parameters such as weights
-  net.InferArgsMap(ctx, &args, args);
-
-  // Initialize all parameters with uniform distribution U(-0.01, 0.01)
-  auto initializer = Uniform(0.01);
-  for (auto& arg : args) {
-    // arg.first is parameter name, and arg.second is the value
-    initializer(arg.first, &arg.second);
-  }
-
-  // Create sgd optimizer
-  Optimizer* opt = OptimizerRegistry::Find("sgd");
-  opt->SetParam("rescale_grad", 1.0/batch_size)
-     ->SetParam("lr", learning_rate)
-     ->SetParam("wd", weight_decay);
-
-  // Create executor by binding parameters to the model
-  auto *exec = net.SimpleBind(ctx, args);
-  auto arg_names = net.ListArguments();
-
-  // Start training
-  for (int iter = 0; iter < max_epoch; ++iter) {
-    int samples = 0;
-    train_iter.Reset();
-
-    auto tic = std::chrono::system_clock::now();
-    while (train_iter.Next()) {
-      samples += batch_size;
-      auto data_batch = train_iter.GetDataBatch();
-      // Set data and label
-      data_batch.data.CopyTo(&args["X"]);
-      data_batch.label.CopyTo(&args["label"]);
-
-      // Compute gradients
-      exec->Forward(true);
-      exec->Backward();
-      // Update parameters
-      for (size_t i = 0; i < arg_names.size(); ++i) {
-        if (arg_names[i] == "X" || arg_names[i] == "label") continue;
-        opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
-      }
-    }
-    auto toc = std::chrono::system_clock::now();
-
-    Accuracy acc;
-    val_iter.Reset();
-    while (val_iter.Next()) {
-      auto data_batch = val_iter.GetDataBatch();
-      data_batch.data.CopyTo(&args["X"]);
-      data_batch.label.CopyTo(&args["label"]);
-      // Forward pass is enough as no gradient is needed when evaluating
-      exec->Forward(false);
-      acc.Update(data_batch.label, exec->outputs[0]);
-    }
-    float duration = std::chrono::duration_cast<std::chrono::milliseconds>
-                     (toc - tic).count() / 1000.0;
-    LG << "Epoch: " << iter << " " << samples/duration << " samples/sec Accuracy: " << acc.Get();
-  }
-
-  delete exec;
-  delete opt;
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/mlp_csv.cpp b/cpp-package/example/mlp_csv.cpp
deleted file mode 100644
index 8db6638a90d3..000000000000
--- a/cpp-package/example/mlp_csv.cpp
+++ /dev/null
@@ -1,276 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*
- * Example: mlp_csv
- * Description:
- * The following example demonstrates how to use CSVIter. This example creates
- * mlp (multi-layer perceptron) model and trains the MNIST data which is in
- * CSV format.
- */
-#include <chrono>
-#include <string>
-#include "utils.h"
-#include "mxnet-cpp/MxNetCpp.h"
-
-using namespace mxnet::cpp;
-
-/*
- * Implementing the mlp symbol with given hidden units configuration.
- */
-Symbol mlp(const std::vector<int> &hidden_units) {
-    auto data = Symbol::Variable("data");
-    auto label = Symbol::Variable("label");
-
-    std::vector<Symbol> weights(hidden_units.size());
-    std::vector<Symbol> biases(hidden_units.size());
-    std::vector<Symbol> outputs(hidden_units.size());
-
-    for (size_t i = 0; i < hidden_units.size(); ++i) {
-        weights[i] = Symbol::Variable("w" + std::to_string(i));
-        biases[i] = Symbol::Variable("b" + std::to_string(i));
-        Symbol fc = FullyConnected(
-                                   i == 0? data : outputs[i-1],  // data
-                                   weights[i],
-                                   biases[i],
-                                   hidden_units[i]);
-        outputs[i] = i == hidden_units.size()-1 ? fc : Activation(fc, ActivationActType::kRelu);
-    }
-    return SoftmaxOutput(outputs.back(), label);
-}
-
-/*
- * Convert the input string of number of hidden units into the vector of integers.
- */
-std::vector<int> getLayers(const std::string& hidden_units_string) {
-    std::vector<int> hidden_units;
-    char *pNext;
-    int num_unit = strtol(hidden_units_string.c_str(), &pNext, 10);
-    hidden_units.push_back(num_unit);
-    while (*pNext) {
-        num_unit = strtol(pNext, &pNext, 10);
-        hidden_units.push_back(num_unit);
-    }
-    return hidden_units;
-}
-
-void printUsage() {
-    std::cout << "Usage:" << std::endl;
-    std::cout << "mlp_csv --train mnist_training_set.csv --test mnist_test_set.csv --epochs 10 "
-    << "--batch_size 100 --hidden_units \"128 64 64\" --gpu" << std::endl;
-    std::cout << "The example uses mnist data in CSV format. The MNIST data in CSV format assumes "
-    << "the column 0 to be label and the rest 784 column to be data." << std::endl;
-    std::cout << "By default, the example uses 'cpu' context. If '--gpu' is specified, "
-    << "program uses 'gpu' context." <<std::endl;
-}
-
-int main(int argc, char** argv) {
-    const int image_size = 28;
-    const int num_mnist_features = image_size * image_size;
-    int batch_size = 100;
-    int max_epoch = 10;
-    const float learning_rate = 0.1;
-    const float weight_decay = 1e-2;
-    bool isGpu = false;
-
-    std::string training_set;
-    std::string test_set;
-    std::string hidden_units_string;
-    int index = 1;
-    while (index < argc) {
-        if (strcmp("--train", argv[index]) == 0) {
-            index++;
-            training_set = argv[index];
-        } else if (strcmp("--test", argv[index]) == 0) {
-            index++;
-            test_set = argv[index];
-        } else if (strcmp("--epochs", argv[index]) == 0) {
-            index++;
-            max_epoch = strtol(argv[index], nullptr, 10);
-        } else if (strcmp("--batch_size", argv[index]) == 0) {
-            index++;
-            batch_size = strtol(argv[index], nullptr, 10);
-        } else if (strcmp("--hidden_units", argv[index]) == 0) {
-            index++;
-            hidden_units_string = argv[index];
-        } else if (strcmp("--gpu", argv[index]) == 0) {
-            isGpu = true;
-            index++;
-        } else if (strcmp("--help", argv[index]) == 0) {
-            printUsage();
-            return 0;
-        }
-        index++;
-    }
-
-    if (training_set.empty() || test_set.empty() || hidden_units_string.empty()) {
-        std::cout << "ERROR: The mandatory arguments such as path to training and test data or "
-        << "number of hidden units for mlp are not specified." << std::endl << std::endl;
-        printUsage();
-        return 1;
-    }
-
-    std::vector<int> hidden_units = getLayers(hidden_units_string);
-
-    if (hidden_units.empty()) {
-        std::cout << "ERROR: Number of hidden units are not provided in correct format."
-        << "The numbers need to be separated by ' '." << std::endl << std::endl;
-        printUsage();
-        return 1;
-    }
-
-    /*
-     * The MNIST data in CSV format has 785 columns.
-     * The first column is "Label" and rest of the columns contain data.
-     * The mnist_train.csv has 60000 records and mnist_test.csv has
-     * 10000 records.
-     */
-    auto train_iter = MXDataIter("CSVIter")
-    .SetParam("data_csv", training_set)
-    .SetParam("data_shape", Shape(num_mnist_features + 1, 1))
-    .SetParam("batch_size", batch_size)
-    .SetParam("flat", 1)
-    .SetParam("shuffle", 0)
-    .CreateDataIter();
-
-    auto val_iter = MXDataIter("CSVIter")
-    .SetParam("data_csv", test_set)
-    .SetParam("data_shape", Shape(num_mnist_features + 1, 1))
-    .SetParam("batch_size", batch_size)
-    .SetParam("flat", 1)
-    .SetParam("shuffle", 0)
-    .CreateDataIter();
-
-    TRY
-    auto net = mlp(hidden_units);
-
-    Context ctx = Context::cpu();
-    if (isGpu) {
-        ctx = Context::gpu();
-    }
-
-    std::map<std::string, NDArray> args;
-    args["data"] = NDArray(Shape(batch_size, num_mnist_features), ctx);
-    args["label"] = NDArray(Shape(batch_size), ctx);
-    // Let MXNet infer shapes other parameters such as weights
-    net.InferArgsMap(ctx, &args, args);
-
-    // Initialize all parameters with uniform distribution U(-0.01, 0.01)
-    auto initializer = Uniform(0.01);
-    for (auto& arg : args) {
-        // arg.first is parameter name, and arg.second is the value
-        initializer(arg.first, &arg.second);
-    }
-
-    // Create sgd optimiz er
-    Optimizer* opt = OptimizerRegistry::Find("sgd");
-    opt->SetParam("rescale_grad", 1.0/batch_size)
-    ->SetParam("lr", learning_rate)
-    ->SetParam("wd", weight_decay);
-
-    // Create executor by binding parameters to the model
-    auto *exec = net.SimpleBind(ctx, args);
-    auto arg_names = net.ListArguments();
-
-    // Start training
-    for (int iter = 0; iter < max_epoch; ++iter) {
-        int samples = 0;
-        train_iter.Reset();
-
-        auto tic = std::chrono::system_clock::now();
-        while (train_iter.Next()) {
-            samples += batch_size;
-            auto data_batch = train_iter.GetDataBatch();
-
-            /*
-             * The shape of data_batch.data is (batch_size, (num_mnist_features + 1))
-             * Need to reshape this data so that label column can be extracted from this data.
-             */
-            NDArray reshapedData = data_batch.data.Reshape(Shape((num_mnist_features + 1),
-                                                                 batch_size));
-
-            /*
-             * Extract the label data by slicing the first column of the data and
-             * copy it to "label" arg.
-             */
-            reshapedData.Slice(0, 1).Reshape(Shape(batch_size)).CopyTo(&args["label"]);
-
-            /*
-             * Extract the feature data by slicing the columns 1 to 785 of the data and
-             * copy it to "data" arg.
-             */
-            reshapedData.Slice(1, (num_mnist_features + 1)).Reshape(Shape(batch_size,
-                                                                         num_mnist_features))
-                                                           .CopyTo(&args["data"]);
-
-            exec->Forward(true);
-
-            // Compute gradients
-            exec->Backward();
-            // Update parameters
-            for (size_t i = 0; i < arg_names.size(); ++i) {
-                if (arg_names[i] == "data" || arg_names[i] == "label") continue;
-                opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
-            }
-        }
-        auto toc = std::chrono::system_clock::now();
-
-        Accuracy acc;
-        val_iter.Reset();
-        while (val_iter.Next()) {
-            auto data_batch = val_iter.GetDataBatch();
-
-            /*
-             * The shape of data_batch.data is (batch_size, (num_mnist_features + 1))
-             * Need to reshape this data so that label column can be extracted from this data.
-             */
-            NDArray reshapedData = data_batch.data.Reshape(Shape((num_mnist_features + 1),
-                                                                 batch_size));
-
-            /*
-             * Extract the label data by slicing the first column of the data and
-             * copy it to "label" arg.
-             */
-            NDArray labelData = reshapedData.Slice(0, 1).Reshape(Shape(batch_size));
-            labelData.CopyTo(&args["label"]);
-
-            /*
-             * Extract the feature data by slicing the columns 1 to 785 of the data and
-             * copy it to "data" arg.
-             */
-            reshapedData.Slice(1, (num_mnist_features + 1)).Reshape(Shape(batch_size,
-                                                                         num_mnist_features))
-                                                                   .CopyTo(&args["data"]);
-
-            // Forward pass is enough as no gradient is needed when evaluating
-            exec->Forward(false);
-            acc.Update(labelData, exec->outputs[0]);
-        }
-        float duration = std::chrono::duration_cast<std::chrono::milliseconds>
-        (toc - tic).count() / 1000.0;
-        LG << "Epoch[" << iter << "]  " << samples/duration << " samples/sec Accuracy: "
-        << acc.Get();
-    }
-
-    delete exec;
-    delete opt;
-    MXNotifyShutdown();
-    CATCH
-    return 0;
-}
diff --git a/cpp-package/example/mlp_gpu.cpp b/cpp-package/example/mlp_gpu.cpp
deleted file mode 100644
index 5265de79d976..000000000000
--- a/cpp-package/example/mlp_gpu.cpp
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Xin Li yakumolx@gmail.com
- */
-#include <chrono>
-#include "utils.h"
-#include "mxnet-cpp/MxNetCpp.h"
-
-using namespace mxnet::cpp;
-
-Symbol mlp(const std::vector<int> &layers) {
-  auto x = Symbol::Variable("X");
-  auto label = Symbol::Variable("label");
-
-  std::vector<Symbol> weights(layers.size());
-  std::vector<Symbol> biases(layers.size());
-  std::vector<Symbol> outputs(layers.size());
-
-  for (size_t i = 0; i < layers.size(); ++i) {
-    weights[i] = Symbol::Variable("w" + std::to_string(i));
-    biases[i] = Symbol::Variable("b" + std::to_string(i));
-    Symbol fc = FullyConnected(
-      i == 0? x : outputs[i-1],  // data
-      weights[i],
-      biases[i],
-      layers[i]);
-    outputs[i] = i == layers.size()-1 ? fc : Activation(fc, ActivationActType::kRelu);
-  }
-
-  return SoftmaxOutput(outputs.back(), label);
-}
-
-int main(int argc, char** argv) {
-  const int image_size = 28;
-  const std::vector<int> layers{128, 64, 10};
-  const int batch_size = 100;
-  const int max_epoch = 10;
-  const float learning_rate = 0.1;
-  const float weight_decay = 1e-2;
-
-  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
-                                          "./data/mnist_data/train-labels-idx1-ubyte",
-                                          "./data/mnist_data/t10k-images-idx3-ubyte",
-                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
-                                        };
-
-  auto train_iter =  MXDataIter("MNISTIter");
-  if (!setDataIter(&train_iter, "Train", data_files, batch_size)) {
-    return 1;
-  }
-
-  auto val_iter = MXDataIter("MNISTIter");
-  if (!setDataIter(&val_iter, "Label", data_files, batch_size)) {
-    return 1;
-  }
-
-  TRY
-  auto net = mlp(layers);
-
-  Context ctx = Context::gpu();  // Use GPU for training
-
-  std::map<std::string, NDArray> args;
-  args["X"] = NDArray(Shape(batch_size, image_size*image_size), ctx);
-  args["label"] = NDArray(Shape(batch_size), ctx);
-  // Let MXNet infer shapes of other parameters such as weights
-  net.InferArgsMap(ctx, &args, args);
-
-  // Initialize all parameters with uniform distribution U(-0.01, 0.01)
-  auto initializer = Uniform(0.01);
-  for (auto& arg : args) {
-    // arg.first is parameter name, and arg.second is the value
-    initializer(arg.first, &arg.second);
-  }
-
-  // Create sgd optimizer
-  Optimizer* opt = OptimizerRegistry::Find("sgd");
-  opt->SetParam("rescale_grad", 1.0/batch_size)
-     ->SetParam("lr", learning_rate)
-     ->SetParam("wd", weight_decay);
-  std::unique_ptr<LRScheduler> lr_sch(new FactorScheduler(5000, 0.1));
-  opt->SetLRScheduler(std::move(lr_sch));
-
-  // Create executor by binding parameters to the model
-  auto *exec = net.SimpleBind(ctx, args);
-  auto arg_names = net.ListArguments();
-
-  // Create metrics
-  Accuracy train_acc, val_acc;
-
-  // Start training
-  for (int iter = 0; iter < max_epoch; ++iter) {
-    int samples = 0;
-    train_iter.Reset();
-    train_acc.Reset();
-
-    auto tic = std::chrono::system_clock::now();
-    while (train_iter.Next()) {
-      samples += batch_size;
-      auto data_batch = train_iter.GetDataBatch();
-      // Data provided by DataIter are stored in memory, should be copied to GPU first.
-      data_batch.data.CopyTo(&args["X"]);
-      data_batch.label.CopyTo(&args["label"]);
-      // CopyTo is imperative, need to wait for it to complete.
-      NDArray::WaitAll();
-
-      // Compute gradients
-      exec->Forward(true);
-      exec->Backward();
-
-      // Update parameters
-      for (size_t i = 0; i < arg_names.size(); ++i) {
-        if (arg_names[i] == "X" || arg_names[i] == "label") continue;
-        opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
-      }
-      // Update metric
-      train_acc.Update(data_batch.label, exec->outputs[0]);
-    }
-    // one epoch of training is finished
-    auto toc = std::chrono::system_clock::now();
-    float duration = std::chrono::duration_cast<std::chrono::milliseconds>
-                     (toc - tic).count() / 1000.0;
-    LG << "Epoch[" << iter << "] " << samples/duration \
-       << " samples/sec " << "Train-Accuracy=" << train_acc.Get();;
-
-    val_iter.Reset();
-    val_acc.Reset();
-    while (val_iter.Next()) {
-      auto data_batch = val_iter.GetDataBatch();
-      data_batch.data.CopyTo(&args["X"]);
-      data_batch.label.CopyTo(&args["label"]);
-      NDArray::WaitAll();
-
-      // Only forward pass is enough as no gradient is needed when evaluating
-      exec->Forward(false);
-      val_acc.Update(data_batch.label, exec->outputs[0]);
-    }
-    LG << "Epoch[" << iter << "] Val-Accuracy=" << val_acc.Get();
-  }
-
-  delete exec;
-  delete opt;
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/resnet.cpp b/cpp-package/example/resnet.cpp
deleted file mode 100644
index 51dbf420ef99..000000000000
--- a/cpp-package/example/resnet.cpp
+++ /dev/null
@@ -1,283 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- */
-#include <map>
-#include <string>
-#include <fstream>
-#include <vector>
-#include <cstdlib>
-#include "utils.h"
-#include "mxnet-cpp/MxNetCpp.h"
-
-using namespace mxnet::cpp;
-
-Symbol ConvolutionNoBias(const std::string& symbol_name,
-                         Symbol data,
-                         Symbol weight,
-                         Shape kernel,
-                         int num_filter,
-                         Shape stride = Shape(1, 1),
-                         Shape dilate = Shape(1, 1),
-                         Shape pad = Shape(0, 0),
-                         int num_group = 1,
-                         int64_t workspace = 512) {
-  return Operator("Convolution")
-      .SetParam("kernel", kernel)
-      .SetParam("num_filter", num_filter)
-      .SetParam("stride", stride)
-      .SetParam("dilate", dilate)
-      .SetParam("pad", pad)
-      .SetParam("num_group", num_group)
-      .SetParam("workspace", workspace)
-      .SetParam("no_bias", true)
-      .SetInput("data", data)
-      .SetInput("weight", weight)
-      .CreateSymbol(symbol_name);
-}
-
-Symbol getConv(const std::string & name, Symbol data,
-               int  num_filter,
-               Shape kernel, Shape stride, Shape pad,
-               bool with_relu,
-               mx_float bn_momentum) {
-  Symbol conv_w(name + "_w");
-  Symbol conv = ConvolutionNoBias(name, data, conv_w,
-                                  kernel, num_filter, stride, Shape(1, 1),
-                                  pad, 1, 512);
-
-  Symbol gamma(name + "_gamma");
-  Symbol beta(name + "_beta");
-  Symbol mmean(name + "_mmean");
-  Symbol mvar(name + "_mvar");
-
-  Symbol bn = BatchNorm(name + "_bn", conv, gamma,
-                        beta, mmean, mvar, 2e-5, bn_momentum, false);
-
-  if (with_relu) {
-    return Activation(name + "_relu", bn, "relu");
-  } else {
-    return bn;
-  }
-}
-
-Symbol makeBlock(const std::string & name, Symbol data, int num_filter,
-                 bool dim_match, mx_float bn_momentum) {
-  Shape stride;
-  if (dim_match) {
-    stride = Shape(1, 1);
-  } else {
-    stride = Shape(2, 2);
-  }
-
-  Symbol conv1 = getConv(name + "_conv1", data, num_filter,
-                         Shape(3, 3), stride, Shape(1, 1),
-                         true, bn_momentum);
-
-  Symbol conv2 = getConv(name + "_conv2", conv1, num_filter,
-                         Shape(3, 3), Shape(1, 1), Shape(1, 1),
-                         false, bn_momentum);
-
-  Symbol shortcut;
-
-  if (dim_match) {
-    shortcut = data;
-  } else {
-    Symbol shortcut_w(name + "_proj_w");
-    shortcut = ConvolutionNoBias(name + "_proj", data, shortcut_w,
-                                 Shape(2, 2), num_filter,
-                                 Shape(2, 2), Shape(1, 1), Shape(0, 0),
-                                 1, 512);
-  }
-
-  Symbol fused = shortcut + conv2;
-  return Activation(name + "_relu", fused, "relu");
-}
-
-Symbol getBody(Symbol data, int num_level, int num_block, int num_filter, mx_float bn_momentum) {
-  for (int level = 0; level < num_level; level++) {
-    for (int block = 0; block < num_block; block++) {
-      data = makeBlock("level" + std::to_string(level + 1) + "_block" + std::to_string(block + 1),
-                       data, num_filter * (std::pow(2, level)),
-                       (level == 0 || block > 0), bn_momentum);
-    }
-  }
-  return data;
-}
-
-Symbol ResNetSymbol(int num_class, int num_level = 3, int num_block = 9,
-                    int num_filter = 16, mx_float bn_momentum = 0.9,
-                    mxnet::cpp::Shape pool_kernel = mxnet::cpp::Shape(8, 8)) {
-  // data and label
-  Symbol data = Symbol::Variable("data");
-  Symbol data_label = Symbol::Variable("data_label");
-
-  Symbol gamma("gamma");
-  Symbol beta("beta");
-  Symbol mmean("mmean");
-  Symbol mvar("mvar");
-
-  Symbol zscore = BatchNorm("zscore", data, gamma,
-                            beta, mmean, mvar, 0.001, bn_momentum);
-
-  Symbol conv = getConv("conv0", zscore, num_filter,
-                        Shape(3, 3), Shape(1, 1), Shape(1, 1),
-                        true, bn_momentum);
-
-  Symbol body = getBody(conv, num_level, num_block, num_filter, bn_momentum);
-
-  Symbol pool = Pooling("pool", body, pool_kernel, PoolingPoolType::kAvg);
-
-  Symbol flat = Flatten("flatten", pool);
-
-  Symbol fc_w("fc_w"), fc_b("fc_b");
-  Symbol fc = FullyConnected("fc", flat, fc_w, fc_b, num_class);
-
-  return SoftmaxOutput("softmax", fc, data_label);
-}
-
-NDArray ResizeInput(NDArray data, const Shape new_shape) {
-  NDArray pic = data.Reshape(Shape(0, 1, 28, 28));
-  NDArray pic_1channel;
-  Operator("_contrib_BilinearResize2D")
-    .SetParam("height", new_shape[2])
-    .SetParam("width", new_shape[3])
-    (pic).Invoke(pic_1channel);
-  NDArray output;
-  Operator("tile")
-    .SetParam("reps", Shape(1, 3, 1, 1))
-    (pic_1channel).Invoke(output);
-  return output;
-}
-
-int main(int argc, char const *argv[]) {
-  int max_epoch = argc > 1 ? strtol(argv[1], nullptr, 10) : 100;
-  float learning_rate = 1e-4;
-  float weight_decay = 1e-4;
-
-  TRY
-  auto resnet = ResNetSymbol(10);
-  std::map<std::string, NDArray> args_map;
-  std::map<std::string, NDArray> aux_map;
-
-  /*context*/
-  auto ctx = Context::cpu();
-  int num_gpu;
-  MXGetGPUCount(&num_gpu);
-  int batch_size = 8;
-#if !MXNET_USE_CPU
-  if (num_gpu > 0) {
-    ctx = Context::gpu();
-    batch_size = 32;
-  }
-#endif
-
-  const Shape data_shape = Shape(batch_size, 3, 224, 224),
-              label_shape = Shape(batch_size);
-  args_map["data"] = NDArray(data_shape, ctx);
-  args_map["data_label"] = NDArray(label_shape, ctx);
-  resnet.InferArgsMap(ctx, &args_map, args_map);
-
-  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
-                                          "./data/mnist_data/train-labels-idx1-ubyte",
-                                          "./data/mnist_data/t10k-images-idx3-ubyte",
-                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
-                                        };
-
-  auto train_iter =  MXDataIter("MNISTIter");
-  if (!setDataIter(&train_iter, "Train", data_files, batch_size)) {
-    return 1;
-  }
-
-  auto val_iter = MXDataIter("MNISTIter");
-  if (!setDataIter(&val_iter, "Label", data_files, batch_size)) {
-    return 1;
-  }
-
-  // initialize parameters
-  Xavier xavier = Xavier(Xavier::gaussian, Xavier::in, 2);
-  for (auto &arg : args_map) {
-    xavier(arg.first, &arg.second);
-  }
-
-  Optimizer* opt = OptimizerRegistry::Find("sgd");
-  opt->SetParam("lr", learning_rate)
-     ->SetParam("wd", weight_decay)
-     ->SetParam("momentum", 0.9)
-     ->SetParam("rescale_grad", 1.0 / batch_size)
-     ->SetParam("clip_gradient", 10);
-
-  auto *exec = resnet.SimpleBind(ctx, args_map);
-  auto arg_names = resnet.ListArguments();
-
-  // Create metrics
-  Accuracy train_acc, val_acc;
-  LogLoss logloss_train, logloss_val;
-  for (int epoch = 0; epoch < max_epoch; ++epoch) {
-    LG << "Epoch: " << epoch;
-    train_iter.Reset();
-    train_acc.Reset();
-    int iter = 0;
-    while (train_iter.Next()) {
-      auto data_batch = train_iter.GetDataBatch();
-      ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]);
-      data_batch.label.CopyTo(&args_map["data_label"]);
-      NDArray::WaitAll();
-
-      exec->Forward(true);
-      exec->Backward();
-
-      for (size_t i = 0; i < arg_names.size(); ++i) {
-        if (arg_names[i] == "data" || arg_names[i] == "data_label") continue;
-        opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
-      }
-      NDArray::WaitAll();
-      train_acc.Update(data_batch.label, exec->outputs[0]);
-      logloss_train.Reset();
-      logloss_train.Update(data_batch.label, exec->outputs[0]);
-      ++iter;
-      LG << "EPOCH: " << epoch << " ITER: " << iter
-         << " Train Accuracy: " << train_acc.Get()
-         << " Train Loss: " << logloss_train.Get();
-    }
-    LG << "EPOCH: " << epoch << " Train Accuracy: " << train_acc.Get();
-
-    val_iter.Reset();
-    val_acc.Reset();
-    iter = 0;
-    while (val_iter.Next()) {
-      auto data_batch = val_iter.GetDataBatch();
-      ResizeInput(data_batch.data, data_shape).CopyTo(&args_map["data"]);
-      data_batch.label.CopyTo(&args_map["data_label"]);
-      NDArray::WaitAll();
-      exec->Forward(false);
-      NDArray::WaitAll();
-      val_acc.Update(data_batch.label, exec->outputs[0]);
-      LG << "EPOCH: " << epoch << " ITER: " << iter << " Val Accuracy: " << val_acc.Get();
-      ++iter;
-    }
-    LG << "Validation Accuracy: " << val_acc.Get();
-  }
-  delete exec;
-  delete opt;
-  MXNotifyShutdown();
-  CATCH
-  return 0;
-}
diff --git a/cpp-package/example/test_regress_label.cpp b/cpp-package/example/test_regress_label.cpp
deleted file mode 100644
index 8ef9d000922c..000000000000
--- a/cpp-package/example/test_regress_label.cpp
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- * 
- * This file is used for testing LinearRegressionOutput can
- *   still bind if label is not provided
- */
-
-#include <iostream>
-#include <vector>
-#include <string>
-#include "dmlc/logging.h"
-#include "mxnet-cpp/MxNetCpp.h"
-
-using namespace mxnet::cpp;
-
-int main() {
-    LOG(INFO) << "Running LinearRegressionOutput symbol testing, "
-                 "executor should be able to bind without label.";
-    Symbol data = Symbol::Variable("data");
-    Symbol label = Symbol::Variable("regress_label");
-    Symbol symbol = LinearRegressionOutput(data, label);
-    std::map<std::string, mxnet::cpp::OpReqType> opReqMap;
-    for (const auto& iter : symbol.ListArguments()) {
-        opReqMap[iter] = mxnet::cpp::OpReqType::kNullOp;
-    }
-    std::map<std::string, mxnet::cpp::NDArray> argMap({
-        {"data", NDArray(Shape{1, 3}, Context::cpu(), true)}
-    });
-
-    try {
-        symbol.SimpleBind(Context::cpu(),
-                argMap,
-                std::map<std::string, mxnet::cpp::NDArray>(),
-                opReqMap,
-                std::map<std::string, mxnet::cpp::NDArray>());
-    } catch (const std::exception& e) {
-        LOG(ERROR) << "Error binding the symbol: " << MXGetLastError() << " " << e.what();
-        throw;
-    }
-    MXNotifyShutdown();
-    return 0;
-}
diff --git a/cpp-package/example/test_score.cpp b/cpp-package/example/test_score.cpp
deleted file mode 100644
index 0ccdf65b3b19..000000000000
--- a/cpp-package/example/test_score.cpp
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Xin Li yakumolx@gmail.com
- * The file is used for testing if the score(accurary) we get
- * is better than the threshold we set using mlp model.
- * By running: build/test_score 0.75
- * 0.75 here means the threshold score
- * It return 0 if we can achieve higher score than threshold, otherwise 1
- */
-#include <chrono>
-#include "utils.h"
-#include "mxnet-cpp/MxNetCpp.h"
-
-using namespace mxnet::cpp;
-
-Symbol mlp(const std::vector<int> &layers) {
-  auto x = Symbol::Variable("X");
-  auto label = Symbol::Variable("label");
-
-  std::vector<Symbol> weights(layers.size());
-  std::vector<Symbol> biases(layers.size());
-  std::vector<Symbol> outputs(layers.size());
-
-  for (size_t i = 0; i < layers.size(); ++i) {
-    weights[i] = Symbol::Variable("w" + std::to_string(i));
-    biases[i] = Symbol::Variable("b" + std::to_string(i));
-    Symbol fc = FullyConnected(
-      i == 0? x : outputs[i-1],  // data
-      weights[i],
-      biases[i],
-      layers[i]);
-    outputs[i] = i == layers.size()-1? fc : Activation(fc, ActivationActType::kRelu);
-  }
-
-  return SoftmaxOutput(outputs.back(), label);
-}
-
-int main(int argc, char** argv) {
-  const float MIN_SCORE = std::stof(argv[1]);
-
-  const int image_size = 28;
-  const std::vector<int> layers{128, 64, 10};
-  const int batch_size = 100;
-  const int max_epoch = 10;
-  const float learning_rate = 0.1;
-  const float weight_decay = 1e-2;
-  float score = 0;
-
-  std::vector<std::string> data_files = { "./data/mnist_data/train-images-idx3-ubyte",
-                                          "./data/mnist_data/train-labels-idx1-ubyte",
-                                          "./data/mnist_data/t10k-images-idx3-ubyte",
-                                          "./data/mnist_data/t10k-labels-idx1-ubyte"
-                                        };
-
-  auto train_iter =  MXDataIter("MNISTIter");
-  if (!setDataIter(&train_iter, "Train", data_files, batch_size)) {
-    return 1;
-  }
-
-  auto val_iter = MXDataIter("MNISTIter");
-  if (!setDataIter(&val_iter, "Label", data_files, batch_size)) {
-    return 1;
-  }
-
-  TRY
-  auto net = mlp(layers);
-
-  Context ctx = Context::gpu();  // Use GPU for training
-#if MXNET_USE_CPU
-  ctx = Context::cpu();
-#endif
-
-  std::map<std::string, NDArray> args;
-  args["X"] = NDArray(Shape(batch_size, image_size*image_size), ctx);
-  args["label"] = NDArray(Shape(batch_size), ctx);
-  // Let MXNet infer shapes of other parameters such as weights
-  net.InferArgsMap(ctx, &args, args);
-
-  // Initialize all parameters with uniform distribution U(-0.01, 0.01)
-  auto initializer = Uniform(0.01);
-  for (auto& arg : args) {
-    // arg.first is parameter name, and arg.second is the value
-    initializer(arg.first, &arg.second);
-  }
-
-  // Create sgd optimizer
-  Optimizer* opt = OptimizerRegistry::Find("sgd");
-  opt->SetParam("rescale_grad", 1.0/batch_size)
-     ->SetParam("lr", learning_rate)
-     ->SetParam("wd", weight_decay);
-  std::unique_ptr<LRScheduler> lr_sch(new FactorScheduler(5000, 0.1));
-  opt->SetLRScheduler(std::move(lr_sch));
-
-  // Create executor by binding parameters to the model
-  auto *exec = net.SimpleBind(ctx, args);
-  auto arg_names = net.ListArguments();
-
-  // Start training
-  for (int iter = 0; iter < max_epoch; ++iter) {
-    int samples = 0;
-    train_iter.Reset();
-
-    auto tic = std::chrono::system_clock::now();
-    while (train_iter.Next()) {
-      samples += batch_size;
-      auto data_batch = train_iter.GetDataBatch();
-      // Data provided by DataIter are stored in memory, should be copied to GPU first.
-      data_batch.data.CopyTo(&args["X"]);
-      data_batch.label.CopyTo(&args["label"]);
-      // CopyTo is imperative, need to wait for it to complete.
-      NDArray::WaitAll();
-
-      // Compute gradients
-      exec->Forward(true);
-      exec->Backward();
-      // Update parameters
-      for (size_t i = 0; i < arg_names.size(); ++i) {
-        if (arg_names[i] == "X" || arg_names[i] == "label") continue;
-        opt->Update(i, exec->arg_arrays[i], exec->grad_arrays[i]);
-      }
-    }
-    auto toc = std::chrono::system_clock::now();
-
-    Accuracy acc;
-    val_iter.Reset();
-    while (val_iter.Next()) {
-      auto data_batch = val_iter.GetDataBatch();
-      data_batch.data.CopyTo(&args["X"]);
-      data_batch.label.CopyTo(&args["label"]);
-      NDArray::WaitAll();
-      // Only forward pass is enough as no gradient is needed when evaluating
-      exec->Forward(false);
-      acc.Update(data_batch.label, exec->outputs[0]);
-    }
-    float duration = std::chrono::duration_cast<std::chrono::milliseconds>
-                     (toc - tic).count() / 1000.0;
-    LG << "Epoch: " << iter << " " << samples/duration << " samples/sec Accuracy: " << acc.Get();
-    score = acc.Get();
-  }
-
-  delete exec;
-  delete opt;
-  MXNotifyShutdown();
-  CATCH
-  return score >= MIN_SCORE ? 0 : 1;
-}
diff --git a/cpp-package/example/unittests/unit_test_mlp_csv.sh b/cpp-package/example/unittests/unit_test_mlp_csv.sh
deleted file mode 100755
index 55ddcdecaafd..000000000000
--- a/cpp-package/example/unittests/unit_test_mlp_csv.sh
+++ /dev/null
@@ -1,63 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# This file is a unit test for mlp_csv.cpp example in 'example' directory.
-# The file
-#    1. Downloads the MNIST data,
-#    2. Converts it into CSV format.
-#    3. Runs the mlp_csv example and ensures that the accuracy is more than expected.
-#
-
-#!/bin/bash
-
-set -e # exit on the first error
-export EXE_NAME=mlp_csv
-
-cd $(dirname $(readlink -f $0))/../
-export LD_LIBRARY_PATH=$(readlink -f ../../lib):$LD_LIBRARY_PATH
-
-if [ ! -f ../../build/cpp-package/example/${EXE_NAME} ];
-then
-echo "FAIL: ${EXE_NAME} does not exist"
-exit
-fi
-
-cp ../../build/cpp-package/example/${EXE_NAME} .
-
-./get_data.sh
-python mnist_to_csv.py ./data/mnist_data/train-images-idx3-ubyte ./data/mnist_data/train-labels-idx1-ubyte ./data/mnist_data/mnist_train.csv 60000
-python mnist_to_csv.py ./data/mnist_data/t10k-images-idx3-ubyte ./data/mnist_data/t10k-labels-idx1-ubyte ./data/mnist_data/mnist_test.csv 10000
-
-./${EXE_NAME} --train ./data/mnist_data/mnist_train.csv --test ./data/mnist_data/mnist_test.csv --epochs 10 --batch_size 100 --hidden_units "128 64 10" 2&> ${EXE_NAME}.log
-
-if [ ! -f ${EXE_NAME}.log ];
-then
-echo "FAIL: Log file ${EXE_NAME}.log does not exist."
-exit
-fi
-
-# Obtain the accuracy achieved by mlp model after training with MNIST data in CSV format.
-export Acc_obtained=`grep -oP '.*\K(?<=Accuracy: ).*$' ${EXE_NAME}.log | tail -1 | tr -d '\n'`
-export Acc_expected=0.98
-
-# If the obtained accuracy does not meet the expected accuracy, report the test as FAIL.
-if [ $(echo "$Acc_obtained $Acc_expected" | awk '{printf($1 >= $2) ? 1 : 0}') -eq 1 ] ;
-then
-echo "PASS: ${EXE_NAME} obtained $Acc_obtained accuracy."
-else
-echo "FAIL: Accuracy = $Acc_obtained is less than expected accuracy $Acc_expected."
-fi
diff --git a/cpp-package/tests/ci_test.sh b/cpp-package/tests/ci_test.sh
index 39f9e06861b3..d04522ded198 100755
--- a/cpp-package/tests/ci_test.sh
+++ b/cpp-package/tests/ci_test.sh
@@ -24,47 +24,15 @@ ls -l ../../lib/
 
 ./get_data.sh
 
-cp ../../build/cpp-package/example/lenet .
-./lenet 1
-
-cp ../../build/cpp-package/example/alexnet .
-./alexnet 1
-
-cp ../../build/cpp-package/example/lenet_with_mxdataiter .
-./lenet_with_mxdataiter 1
-
-cp ../../build/cpp-package/example/resnet .
-./resnet 1
-
-cp ../../build/cpp-package/example/inception_bn .
-./inception_bn 1
-
-cp ../../build/cpp-package/example/mlp .
-./mlp 150
-
-cp ../../build/cpp-package/example/mlp_cpu .
-./mlp_cpu
-
-cp ../../build/cpp-package/example/mlp_gpu .
-./mlp_gpu
-
 cp ../../build/cpp-package/example/test_optimizer .
 ./test_optimizer
 
 cp ../../build/cpp-package/example/test_kvstore .
 ./test_kvstore
 
-cp ../../build/cpp-package/example/test_score .
-./test_score 0.93
-
 cp ../../build/cpp-package/example/test_ndarray_copy .
 ./test_ndarray_copy
 
-cp ../../build/cpp-package/example/test_regress_label .
-./test_regress_label
-
-sh unittests/unit_test_mlp_csv.sh
-
 cd inference
 
 cp ../../../build/cpp-package/example/sentiment_analysis_rnn .
diff --git a/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md b/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md
index 8cd29f2a32b3..61ee25784817 100644
--- a/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md
+++ b/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md
@@ -243,9 +243,9 @@ Despite that there are 2 classes, there should be only one output neuron, becaus
 
 For `SigmoidBinaryCrossEntropyLoss` to work it is required that classes were encoded as 0 and 1. In some datasets the class encoding might be different, like -1 and 1 or 1 and 2. If this is how your dataset looks like, then you need to re-encode the data before using `SigmoidBinaryCrossEntropyLoss`.
 
-## Tip 3: Use SigmoidBinaryCrossEntropyLoss instead of LogisticRegressionOutput
+## Tip 3: Use SigmoidBinaryCrossEntropyLoss
 
-NDArray API has two options to calculate logistic regression loss: [SigmoidBinaryCrossEntropyLoss](https://mxnet.apache.org/api/python/gluon/loss.html#mxnet.gluon.loss.SigmoidBinaryCrossEntropyLoss) and [LogisticRegressionOutput](https://mxnet.apache.org/api/python/ndarray/ndarray.html#mxnet.ndarray.LogisticRegressionOutput). `LogisticRegressionOutput` is designed to be an output layer when using the Module API, and is not supposed to be used when using Gluon API.
+NDArray API has an option to calculate logistic regression loss: [SigmoidBinaryCrossEntropyLoss](https://mxnet.apache.org/api/python/gluon/loss.html#mxnet.gluon.loss.SigmoidBinaryCrossEntropyLoss).
 
 ## Conclusion
 
diff --git a/docs/python_docs/python/tutorials/performance/backend/amp.md b/docs/python_docs/python/tutorials/performance/backend/amp.md
index c18ff29ef4c1..289e783a0fb8 100644
--- a/docs/python_docs/python/tutorials/performance/backend/amp.md
+++ b/docs/python_docs/python/tutorials/performance/backend/amp.md
@@ -39,7 +39,6 @@ import numpy as np
 import mxnet as mx
 import mxnet.gluon as gluon
 from mxnet import autograd
-from mxnet.test_utils import download_model
 import gluoncv as gcv
 from gluoncv.model_zoo import get_model
 
@@ -291,4 +290,3 @@ Also, you can force cast the params wherever possible to FP16.
 ## Current limitations of AMP
 
 - AMP's dynamic loss scaling currently supports only Gluon trainer with `update_on_kvstore=False` option set
-- Using `SoftmaxOutput`, `LinearRegressionOutput`, `LogisticRegressionOutput`, `MAERegressionOutput` with dynamic loss scaling does not work when training networks with multiple Gluon trainers and so multiple loss scales
diff --git a/docs/static_site/src/pages/api/cpp/docs/tutorials/basics.md b/docs/static_site/src/pages/api/cpp/docs/tutorials/basics.md
index 51077bfd0e41..befbabd606eb 100644
--- a/docs/static_site/src/pages/api/cpp/docs/tutorials/basics.md
+++ b/docs/static_site/src/pages/api/cpp/docs/tutorials/basics.md
@@ -78,120 +78,6 @@ auto val_iter = MXDataIter("MNISTIter")
 The data have been successfully loaded. We can now easily construct various models to identify
 the digits with the help of C++ package.
 
-
-Multilayer Perceptron
----------------------
-If you are not familiar with multilayer perceptron, you can get some basic information
-[here](https://mxnet.apache.org/api/python/docs/tutorials/packages/gluon/image/mnist.html). We only focus on
-the implementation in this tutorial.
-
-Constructing multilayer perceptron model is straightforward, assume we store the hidden size
-for each layer in `layers`, and each layer uses
-[ReLu](https://en.wikipedia.org/wiki/Rectifier_(neural_networks)) function as activation.
-
-```c++
-Symbol mlp(const vector<int> &layers) {
-  auto x = Symbol::Variable("X");
-  auto label = Symbol::Variable("label");
-
-  vector<Symbol> weights(layers.size());
-  vector<Symbol> biases(layers.size());
-  vector<Symbol> outputs(layers.size());
-
-  for (int i=0; i<layers.size(); ++i) {
-    weights[i] = Symbol::Variable("w" + to_string(i));
-    biases[i] = Symbol::Variable("b" + to_string(i));
-    Symbol fc = FullyConnected(
-      i == 0? x : outputs[i-1]
-      weights[i],
-      biases[i],
-      layers[i]
-    );
-    outputs[i] = i == layers.size()-1 ? fc : Activation(fc, ActivationActType::relu);
-  }
-
-  return SoftmaxOutput(outputs.back(), label);
-}
-```
-
-The above function defines a multilayer perceptron model where hidden sizes are specified
-by `layers`.
-
-We now create and initialize the parameters after the model is constructed. MXNet can help
- you to infer shapes of most of the parameters. Basically only the shape of data and label
- is needed.
-
-```c++
-std::map<string, NDArray> args;
-args["X"] = NDArray(Shape(batch_size, image_size*image_size), ctx);
-args["label"] = NDArray(Shape(batch_size), ctx);
-// Let MXNet infer shapes other parameters such as weights
-net.InferArgsMap(ctx, &args, args);
-
-// Initialize all parameters with uniform distribution U(-0.01, 0.01)
-auto initializer = Uniform(0.01);
-for (auto& arg : args) {
-  // arg.first is parameter name, and arg.second is the value
-  initializer(arg.first, &arg.second);
-}
-```
-
-The rest is to train the model with an optimizer.
-```c++
-// Create sgd optimizer
-Optimizer* opt = OptimizerRegistry::Find("sgd");
-opt->SetParam("rescale_grad", 1.0/batch_size);
-
-// Start training
-for (int iter = 0; iter < max_epoch; ++iter) {
-  train_iter.Reset();
-
-  while (train_iter.Next()) {
-    auto data_batch = train_iter.GetDataBatch();
-    // Set data and label
-    args["X"] = data_batch.data;
-    args["label"] = data_batch.label;
-
-    // Create executor by binding parameters to the model
-    auto *exec = net.SimpleBind(ctx, args);
-    // Compute gradients
-    exec->Forward(true);
-    exec->Backward();
-    // Update parameters
-    exec->UpdateAll(opt, learning_rate, weight_decay);
-    // Remember to free the memory
-    delete exec;
-  }
-}
-```
-
-We also want to see how our model performs. The C++ package provides convenient APIs for
-evaluating. Here we use accuracy as metric. The inference is almost the same as training,
- except that we don't need gradients.
-
-```c++
-Accuracy acc;
-val_iter.Reset();
-while (val_iter.Next()) {
-  auto data_batch = val_iter.GetDataBatch();
-  args["X"] = data_batch.data;
-  args["label"] = data_batch.label;
-  auto *exec = net.SimpleBind(ctx, args);
-  // Forward pass is enough as no gradient is needed when evaluating
-  exec->Forward(false);
-  acc.Update(data_batch.label, exec->outputs[0]);
-  delete exec;
-}
-```
-
-You can find the complete code in `mlp_cpu.cpp`. Use `make mlp_cpu` to compile it,
- and `./mlp_cpu` to run it. If it complains that the shared library `libmxnet.so` is not found
- after typing `./mlp_cpu`, you will need to specify the path to the shared library in
- the environment variable `LD_LIBRARY_PATH` in Linux and `DYLD_LIBRARY_PATH`
- in MacOS. For example, if you are using MacOS, typing
- `DYLD_LIBRARY_PATH+=. ./mlp_cpu` would solve the problem. It basically tells the system
- to find the shared library under the current directory since we have just copied it here.
-
 GPU Support
 -----------
 It's worth noting that changing context from `Context::cpu()` to `Context::gpu()` is not enough,
diff --git a/docs/static_site/src/pages/api/cpp/docs/tutorials/multi_threaded_inference.md b/docs/static_site/src/pages/api/cpp/docs/tutorials/multi_threaded_inference.md
index d0b38a015656..f490aa12e6fc 100644
--- a/docs/static_site/src/pages/api/cpp/docs/tutorials/multi_threaded_inference.md
+++ b/docs/static_site/src/pages/api/cpp/docs/tutorials/multi_threaded_inference.md
@@ -81,17 +81,9 @@ $ make
 
 If you have built mxnet from source with cmake, please uncomment the specific lines for cmake build or set the following environment variables: `MKLDNN_BUILD_DIR (default is $(MXNET_ROOT)/3rdparty/mkldnn/build)`, `MKLDNN_INCLUDE_DIR (default is $(MXNET_ROOT)/3rdparty/mkldnn/include)`, `MXNET_LIB_DIR (default is $(MXNET_ROOT)/lib)`.
 
-### Download the model and run multi threaded inference example
-To download a model use the `get_model.py` script. This downloads a model to run inference.
-
-```python
-python3 get_model.py --model <model_name>
-```
-e.g.
-```python
-python3 get_model.py --model imagenet1k-inception-bn
-```
-Only the supported models with `get_model.py` work with multi threaded inference.
+### Run multi threaded inference example
+The example is tested with models such as `imagenet1k-inception-bn`, `imagenet1k-resnet-50`,
+`imagenet1k-resnet-152`, and `imagenet1k-resnet-18`.
 
 To run the multi threaded inference example:
 
diff --git a/docs/static_site/src/pages/api/faq/caffe.md b/docs/static_site/src/pages/api/faq/caffe.md
index 147ffd7bc428..ba84b8b590be 100644
--- a/docs/static_site/src/pages/api/faq/caffe.md
+++ b/docs/static_site/src/pages/api/faq/caffe.md
@@ -82,7 +82,6 @@ act1 = mx.sym.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}")
 fc2  = mx.sym.CaffeOp(data_0=act1, num_weight=2, name='fc2', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }")
 act2 = mx.sym.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}")
 fc3 = mx.sym.CaffeOp(data_0=act2, num_weight=2, name='fc3', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}")
-mlp = mx.sym.SoftmaxOutput(data=fc3, name='softmax')
 ```
 
 Let's break it down. First, `data = mx.sym.Variable('data')` defines a variable
diff --git a/docs/static_site/src/pages/api/faq/float16.md b/docs/static_site/src/pages/api/faq/float16.md
index 8a6d413449a6..aace11ad261f 100644
--- a/docs/static_site/src/pages/api/faq/float16.md
+++ b/docs/static_site/src/pages/api/faq/float16.md
@@ -104,77 +104,9 @@ You can check the parameters of the model by calling [summary](/api/python/docs/
 net.summary(mx.nd.uniform(shape=(1, 3, 224, 224), dtype=np.float16))
 ```
 
-## Using the Symbolic API
-
-Training a network in float16 with the Symbolic API involves the following steps.
-
-1. Add a layer at the beginning of the network, to cast the data to float16. This will ensure that all the following layers compute in float16.
-2. It is advisable to cast the output of the layers before softmax to float32, so that the softmax computation is done in float32. This is because softmax involves large reductions and it helps to keep that in float32 for more precise answer.
-3. It is advisable to use the multi-precision mode of the optimizer for more precise weight updates. Here's how you would enable this mode when creating an optimizer.
-
-```python
-optimizer = mx.optimizer.create('sgd', multi_precision=True, lr=0.01)
-```
-
-For a full example, please refer to [resnet.py](https://github.com/apache/incubator-mxnet/blob/master/example/image-classification/symbols/resnet.py) file on GitHub. A small, relevant excerpt from that file is presented below.
-
-```python
-data = mx.sym.Variable(name="data")
-
-if dtype == 'float16':
-    data = mx.sym.Cast(data=data, dtype=np.float16)
-
-# ... the rest of the network
-net_out = net(data)
-
-if dtype == 'float16':
-    net_out = mx.sym.Cast(data=net_out, dtype=np.float32)
-
-output = mx.sym.SoftmaxOutput(data=net_out, name='softmax')
-```
-
-If you would like to train ResNet50 model on ImageNet using float16 precision, you can find the full script [here](https://github.com/apache/incubator-mxnet/blob/master/docs/static_site/src/pages/api/faq/float16.md)
-
-If you don't have ImageNet dataset at your disposal, you can still run the script above using synthetic float16 data by providing the following command:
-
-```bash
-python train_imagenet.py --network resnet-v1 --num-layers 50 --benchmark 1 --gpus 0 --batch-size 256 --dtype float16
-```
-
-There's a similar example for float16 fine tuning [here](https://github.com/apache/incubator-mxnet/tree/master/example/image-classification/fine-tune.py) of selected models: Inception v3, Inception v4, ResNetV1, ResNet50, ResNext or VGG. The command below shows how to use that script to fine-tune a Resnet50 model trained on Imagenet for the Caltech 256 dataset using float16.
-
-```bash
-python fine-tune.py --network resnet --num-layers 50 --pretrained-model imagenet1k-resnet-50 --data-train ~/.mxnet/dataset/caltech-256/caltech256-train.rec --data-val ~/data/caltech-256/caltech256-val.rec --num-examples 15420 --num-classes 256 --gpus 0 --batch-size 64 --dtype float16
-```
-
-If you don't have the `Caltech256` dataset, you can download it using the script below, and convert it into .rec file format using [im2rec utility file](https://github.com/apache/incubator-mxnet/blob/master/tools/im2rec.py)
-
-```python
-import os
-from os.path import expanduser
-import tarfile
-import mxnet as mx
-
-
-data_folder = expanduser("~/.mxnet/datasets/")
-dataset_name = "256_ObjectCategories"
-archive_file = "{}.tar".format(dataset_name)
-archive_path = os.path.join(data_folder, archive_file)
-data_url = "http://www.vision.caltech.edu/Image_Datasets/Caltech256/"
-
-if not os.path.isfile(archive_path):
-    mx.test_utils.download("{}{}".format(data_url, archive_file),
-                           dirname=data_folder)
-    print('Extracting {} in {}...'.format(archive_file, data_folder))
-    tar = tarfile.open(archive_path)
-    tar.extractall(data_folder)
-    tar.close()
-    print('Data extracted.')
-```
-
 ## Example training results
 
-Let us consider training a Resnet50V1 model on the ImageNet 2012 dataset. For this model, the GPU memory usage is close to the capacity of V100 GPU with a batch size of 128 when using float32. Using float16 allows the use of 256 batch size. Shared below are results using 8 V100 GPUs on a an [AWS p3.16xlarge](https://aws.amazon.com/ec2/instance-types/p3/#Amazon_EC2_P3_Instance_Product_Details) instance.
+Let us consider training a Resnet50-V1 model on the ImageNet 2012 dataset. For this model, the GPU memory usage is close to the capacity of V100 GPU with a batch size of 128 when using float32. Using float16 allows the use of 256 batch size. Shared below are results using 8 V100 GPUs on an [AWS p3.16xlarge](https://aws.amazon.com/ec2/instance-types/p3/#Amazon_EC2_P3_Instance_Product_Details) instance.
 
 Let us compare the three scenarios that arise here: float32 with 1024 batch size, float16 with 1024 batch size and float16 with 2048 batch size. These jobs trained for 90 epochs using a learning rate of 0.4 for 1024 batch size and 0.8 for 2048 batch size. This learning rate was decayed by a factor of 0.1 at the 30th, 60th and 80th epochs. The only changes made for the float16 jobs when compared to the float32 job were that the network and data were cast to float16, and the multi-precision mode was used for optimizer. The final accuracy at 90th epoch and the time to train are tabulated below for these three scenarios. The top-1 validation errors at the end of each epoch are also plotted below.
 
@@ -233,15 +165,6 @@ optimizer = mx.optimizer.create('sgd',
                                 rescale_grad=1.0/128)
 ```
 
-*Module API*
-
-```python
-mxnet.sym.SoftmaxOutput(other_args, grad_scale=128.0)
-optimizer = mx.optimizer.create('sgd',
-                                multi_precision=True,
-                                rescale_grad=1.0/128)
-```
-
 Networks like Multibox SSD, R-CNN, bigLSTM and Seq2seq were found to exhibit such behavior.
 You can choose a constant scaling factor while ensuring that the absolute value of gradient when multiplied by this factor remains in the range of float16. Generally powers of 2 like 64, 128, 256, 512 are chosen. Refer to the linked articles below for more details on this.
 
@@ -253,4 +176,4 @@ You can choose a constant scaling factor while ensuring that the absolute value
 
 ## Recommended Next Steps
 
-* Check out our video tutorial on [Using Mixed Precision with MXNet](https://www.youtube.com/watch?v=pR4KMh1lGC0)
\ No newline at end of file
+* Check out our video tutorial on [Using Mixed Precision with MXNet](https://www.youtube.com/watch?v=pR4KMh1lGC0)
diff --git a/docs/static_site/src/pages/api/faq/visualize_graph.md b/docs/static_site/src/pages/api/faq/visualize_graph.md
deleted file mode 100644
index 8d477779b54f..000000000000
--- a/docs/static_site/src/pages/api/faq/visualize_graph.md
+++ /dev/null
@@ -1,88 +0,0 @@
----
-layout: page_category
-title: Visualize Neural Networks
-category: faq
-faq_c: Model
-question: How do I visualize neural networks as computation graphs?
-permalink: /api/faq/visualize_graph
----
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-# How to visualize Neural Networks as computation graph
-
-Here, we'll demonstrate how to use ```mx.viz.plot_network```
-for visualizing your neural networks. ```mx.viz.plot_network```
-represents the neural network as a computation graph consisting of nodes and edges.
-The visualizations make clear which nodes correspond to inputs,
-where the computation starts,
-and which correspond to output nodes,
-from which the result can be read.
-
-## Prerequisites
-You need the [Jupyter Notebook](http://jupyter.readthedocs.io/en/latest/)
-and [Graphviz](https://www.graphviz.org/) libraries to visualize the network.
-Please make sure you have followed [installation instructions]({{'get_started'|relative_url}})
-in setting up above dependencies along with setting up MXNet.
-
-## Visualize the sample Neural Network
-
-```mx.viz.plot_network``` takes [Symbol]({{'/api/python/docs/api/symbol/index'|relative}}), with your Network definition, and optional node_attrs, parameters for the shape of the node in the graph,  as input and generates a computation graph.
-
-We will now try to visualize a sample Neural Network for linear matrix factorization:
-- Start Jupyter notebook server
-```bash
-  $ jupyter notebook
-```
-- Access Jupyter notebook in your browser - http://localhost:8888/.
-- Create a new notebook - "File -> New Notebook -> Python 2"
-- Copy and run below code to visualize a simple network.
-
-```python
-import mxnet as mx
-user = mx.symbol.Variable('user')
-item = mx.symbol.Variable('item')
-score = mx.symbol.Variable('score')
-
-# Set dummy dimensions
-k = 64
-max_user = 100
-max_item = 50
-
-# user feature lookup
-user = mx.symbol.Embedding(data = user, input_dim = max_user, output_dim = k)
-
-# item feature lookup
-item = mx.symbol.Embedding(data = item, input_dim = max_item, output_dim = k)
-
-# predict by the inner product, which is elementwise product and then sum
-net = user * item
-net = mx.symbol.sum_axis(data = net, axis = 1)
-net = mx.symbol.Flatten(data = net)
-
-# loss layer
-net = mx.symbol.LinearRegressionOutput(data = net, label = score)
-
-# Visualize your network
-mx.viz.plot_network(net)
-```
-You should see computation graph something like the following image:
-<img src=https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/image/SampleNetworkVisualization.png
-width=400/>
-
-# References
-* [Example MXNet Matrix Factorization](https://github.com/dmlc/mxnet/blob/master/example/recommenders/demo1-MF.ipynb)
-* [Visualizing CNN Architecture of MXNet Tutorials](http://josephpcohen.com/w/visualizing-cnn-architectures-side-by-side-with-mxnet/)
diff --git a/docs/static_site/src/pages/api/perl/docs/tutorials/io.md b/docs/static_site/src/pages/api/perl/docs/tutorials/io.md
index 4e6078740007..12608d8c445e 100644
--- a/docs/static_site/src/pages/api/perl/docs/tutorials/io.md
+++ b/docs/static_site/src/pages/api/perl/docs/tutorials/io.md
@@ -54,22 +54,6 @@ DataDesc[softmax_label,25,float32,NCHW]
 So this iterator can be used to train a symbol whose input data variable has
 name `data` and input label variable has name `softmax_label`.
 
-
-```perl
-pdl> $data  = mx->sym->Variable('data')
-pdl> $label = mx->sym->Variable('softmax_label')
-pdl> $fullc = mx->sym->FullyConnected(data=>$data, num_hidden=>1)
-pdl> $loss  = mx->sym->SoftmaxOutput(data=>$data, label=>$label)
-pdl> $mod   = mx->mod->Module($loss)
-pdl> print($mod->data_names->[0])
-data
-pdl> print($mod->label_names->[0])
-softmax_label
-pdl> $mod->bind(data_shapes=>$nd_iter->provide_data, label_shapes=>$nd_iter->provide_label)
-```
-
-Then we can call `$mod->fit($nd_iter, num_epoch=>2)` to train `loss` for 2 epochs.
-
 ## Predefined Data iterators
 
 ```perl
diff --git a/docs/static_site/src/pages/api/perl/docs/tutorials/module.md b/docs/static_site/src/pages/api/perl/docs/tutorials/module.md
deleted file mode 100644
index 006bfbdd2441..000000000000
--- a/docs/static_site/src/pages/api/perl/docs/tutorials/module.md
+++ /dev/null
@@ -1,70 +0,0 @@
----
-layout: page_api
-title: Module API
-is_tutorial: true
-tag: perl
-permalink: /api/perl/docs/tutorials/module
----
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-
-# Module API
-
-## Overview
-
-The module API, defined in the `module` (or simply `mod`) package (`AI::MXNet::Module` under the hood), provides an
-intermediate and high-level interface for performing computation with a
-`AI::MXNet::Symbol` or just `mx->sym`. One can roughly think of a module as a machine which can execute a
-program defined by a `Symbol`.
-
-The class `AI::MXNet::Module` is a commonly used module, which accepts an `AI::MXNet::Symbol` as
-the input:
-
-```perl
-pdl> $data = mx->symbol->Variable('data')
-pdl> $fc1  = mx->symbol->FullyConnected($data, name=>'fc1', num_hidden=>128)
-pdl> $act1 = mx->symbol->Activation($fc1, name=>'relu1', act_type=>"relu")
-pdl> $fc2  = mx->symbol->FullyConnected($act1, name=>'fc2', num_hidden=>10)
-pdl> $out  = mx->symbol->SoftmaxOutput($fc2, name => 'softmax')
-pdl> $mod  = mx->mod->Module($out)  # create a module by given a Symbol
-```
-
-Assume there is a valid MXNet data iterator `data`. We can initialize the
-module:
-
-```perl
-pdl> $mod->bind(data_shapes=>$data->provide_data,
-         label_shapes=>$data->provide_label)  # create memory by given input shapes
-pdl> $mod->init_params()  # initial parameters with the default random initializer
-```
-
-Now the module is able to compute. We can call high-level API to train and
-predict:
-
-```perl
-pdl> $mod->fit($data, num_epoch=>10, ...)  # train
-pdl> $mod->predict($new_data)  # predict on new data
-```
-
-or use intermediate APIs to perform step-by-step computations
-
-```perl
-pdl> $mod->forward($data_batch, is_train => 1)  # forward on the provided data batch
-pdl> $mod->backward()  # backward to calculate the gradients
-pdl> $mod->update()  # update parameters using the default optimizer
-```
diff --git a/docs/static_site/src/pages/api/perl/docs/tutorials/symbol.md b/docs/static_site/src/pages/api/perl/docs/tutorials/symbol.md
index 0a70c7d280ce..ca19d10ddcd0 100644
--- a/docs/static_site/src/pages/api/perl/docs/tutorials/symbol.md
+++ b/docs/static_site/src/pages/api/perl/docs/tutorials/symbol.md
@@ -37,16 +37,6 @@ Topics:
 The symbolic API provides a way to configure computation graphs.
 You can configure the graphs either at the level of neural network layer operations or as fine-grained operations.
 
-The following example configures a two-layer neural network.
-
-```perl
-pdl> use AI::MXNet qw(mx)
-pdl> $data = mx->symbol->Variable("data")
-pdl> $fc1  = mx->symbol->FullyConnected(data => $data, name => "fc1", num_hidden => 128)
-pdl> $act1 = mx->symbol->Activation(data => $fc1, name => "relu1", act_type => "relu")
-pdl> $fc2 =  mx->symbol->FullyConnected(data => $act1, name => "fc2", num_hidden => 64)
-pdl> $net =  mx->symbol->SoftmaxOutput(data => $fc2, name => "out")
-```
 
 The basic arithmetic operators (plus, minus, div, multiplication) are overloaded for
 *element-wise operations* of symbols.
@@ -126,26 +116,3 @@ input data, and the weights of the neural network that were learned during train
 
 To manually execute a set of symbols, you need to create an [`AI::MXNet::Executor`] object,
 which is typically constructed by calling the [`simple_bind(<parameters>)`] method on a AI::MXNet::Symbol.
-
-## Multiple Outputs
-
-To group the symbols together, use the [AI::MXNet::Symbol->Group](#mxnet.symbol.Group) function.
-
-```perl
-pdl> use AI::MXNet qw(mx)
-pdl> use Data::Dumper
-pdl> $data  = mx->sym->Variable("data")
-pdl> $fc1   = mx->sym->FullyConnected($data, name => "fc1", num_hidden => 128)
-pdl> $act1  = mx->sym->Activation($fc1, name => "relu1", act_type => "relu")
-pdl> $fc2   = mx->sym->FullyConnected($act1, name => "fc2", num_hidden => 64)
-pdl> $net   = mx->sym->SoftmaxOutput($fc2, name => "softmax")
-pdl> $group = mx->sym->Group([$fc1, $net])
-pdl> print Dumper($group->list_outputs())
-$VAR1 = [
-    'fc1_output',
-    'softmax_output'
-];
-```
-
-After you get the ```Group```, you can bind on ```group``` instead.
-The resulting executor will have two outputs, one for fc1_output and one for softmax_output.
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/callback_function.md b/docs/static_site/src/pages/api/r/docs/tutorials/callback_function.md
deleted file mode 100644
index d74112db98b5..000000000000
--- a/docs/static_site/src/pages/api/r/docs/tutorials/callback_function.md
+++ /dev/null
@@ -1,278 +0,0 @@
----
-layout: page_api
-title: Callback Function
-is_tutorial: true
-tag: r
-permalink: /api/r/docs/tutorials/callback_function
----
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-Callback Function
-======================================
-
-This tutorial provides guidelines for using and writing callback functions,
-which can be very useful in model training.
-
-Model Training Example
-----------
-
-Let's begin with a small example. We can build and train a model with the following code:
-
-
- ```r
-    library(mxnet)
-    data(BostonHousing, package="mlbench")
-    train.ind = seq(1, 506, 3)
-    train.x = data.matrix(BostonHousing[train.ind, -14])
-    train.y = BostonHousing[train.ind, 14]
-    test.x = data.matrix(BostonHousing[-train.ind, -14])
-    test.y = BostonHousing[-train.ind, 14]
-    data <- mx.symbol.Variable("data")
-    fc1 <- mx.symbol.FullyConnected(data, num_hidden=1)
-    lro <- mx.symbol.LinearRegressionOutput(fc1)
-    mx.set.seed(0)
-    model <- mx.model.FeedForward.create(
-      lro, X=train.x, y=train.y,
-      eval.data=list(data=test.x, label=test.y),
-      ctx=mx.cpu(), num.round=10, array.batch.size=20,
-      learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse)
- ```
-
- ```
-    ## Auto detect layout of input matrix, use row major..
-    ## Start training with 1 devices
-    ## [1] Train-rmse=16.063282524034
-    ## [1] Validation-rmse=10.1766446093622
-    ## [2] Train-rmse=12.2792375712573
-    ## [2] Validation-rmse=12.4331776190813
-    ## [3] Train-rmse=11.1984634005885
-    ## [3] Validation-rmse=10.3303041888193
-    ## [4] Train-rmse=10.2645236892904
-    ## [4] Validation-rmse=8.42760407903415
-    ## [5] Train-rmse=9.49711005504284
-    ## [5] Validation-rmse=8.44557808483234
-    ## [6] Train-rmse=9.07733734175182
-    ## [6] Validation-rmse=8.33225500266177
-    ## [7] Train-rmse=9.07884450847991
-    ## [7] Validation-rmse=8.38827833418459
-    ## [8] Train-rmse=9.10463850277417
-    ## [8] Validation-rmse=8.37394452365264
-    ## [9] Train-rmse=9.03977049028532
-    ## [9] Validation-rmse=8.25927979725672
-    ## [10] Train-rmse=8.96870685004475
-    ## [10] Validation-rmse=8.19509291481822
- ```
-
-We also provide two optional parameters, `batch.end.callback` and `epoch.end.callback`, which can provide great flexibility in model training.
-
-How to Use Callback Functions
----------
-
-This package provides two callback functions:
-
-- `mx.callback.save.checkpoint` saves a checkpoint to files during each period iteration.
-
-```r
-         model <- mx.model.FeedForward.create(
-           lro, X=train.x, y=train.y,
-           eval.data=list(data=test.x, label=test.y),
-           ctx=mx.cpu(), num.round=10, array.batch.size=20,
-           learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
-           epoch.end.callback = mx.callback.save.checkpoint("boston"))
-```
-
-```
-          ## Auto detect layout of input matrix, use row major..
-          ## Start training with 1 devices
-          ## [1] Train-rmse=19.1621424021617
-          ## [1] Validation-rmse=20.721515592165
-          ## Model checkpoint saved to boston-0001.params
-          ## [2] Train-rmse=13.5127391952367
-          ## [2] Validation-rmse=14.1822123675007
-          ## Model checkpoint saved to boston-0002.params
-```
-
-
-- `mx.callback.log.train.metric` logs a training metric each period. You can use it either as a `batch.end.callback` or an
-`epoch.end.callback`.
-
-
-```r
-         model <- mx.model.FeedForward.create(
-           lro, X=train.x, y=train.y,
-           eval.data=list(data=test.x, label=test.y),
-           ctx=mx.cpu(), num.round=10, array.batch.size=20,
-           learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
-           batch.end.callback = mx.callback.log.train.metric(5))
- ```
-
-```
-         ## Auto detect layout of input matrix, use row major..
-         ## Start training with 1 devices
-         ## Batch [5] Train-rmse=17.6514558545416
-         ## [1] Train-rmse=15.2879610219001
-         ## [1] Validation-rmse=12.3332062820921
-         ## Batch [5] Train-rmse=11.939392828565
-         ## [2] Train-rmse=11.4382242547217
-         ## [2] Validation-rmse=9.91176550103181
-         ............
-```
-
-You also can save the training and evaluation errors for later use by passing a reference class:
-
-
- ```r
-    logger <- mx.metric.logger$new()
-    model <- mx.model.FeedForward.create(
-      lro, X=train.x, y=train.y,
-      eval.data=list(data=test.x, label=test.y),
-      ctx=mx.cpu(), num.round=10, array.batch.size=20,
-      learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
-      epoch.end.callback = mx.callback.log.train.metric(5, logger))
- ```
-
- ```
-    ## Auto detect layout of input matrix, use row major..
-    ## Start training with 1 devices
-    ## [1] Train-rmse=19.1083228733256
-    ## [1] Validation-rmse=12.7150687428974
-    ## [2] Train-rmse=15.7684378116157
-    ## [2] Validation-rmse=14.8105319420491
-    ............
- ```
-
- ```r
-    head(logger$train)
- ```
-
- ```
-    ## [1] 19.108323 15.768438 13.531470 11.386050  9.555477  9.351324
- ```
-
- ```r
-    head(logger$eval)
- ```
-
- ```
-    ## [1] 12.715069 14.810532 15.840361 10.898733  9.349706  9.363087
- ```
-
-How to Write Your Own Callback Functions
-----------
-
-You can find the source code for the two callback functions on [GitHub](https://github.com/dmlc/mxnet/blob/master/R-package/R/callback.R) and use it as a template:
-
-Basically, all callback functions follow the following structure:
-
-
- ```r
-    mx.callback.fun <- function() {
-      function(iteration, nbatch, env) {
-      }
-    }
- ```
-
-The following `mx.callback.save.checkpoint` function is stateless. It gets the model from the environment and saves it:
-
-
- ```r
-    mx.callback.save.checkpoint <- function(prefix, period=1) {
-      function(iteration, nbatch, env) {
-      if (iteration %% period == 0) {
-      mx.model.save(env$model, prefix, iteration)
-      cat(sprintf("Model checkpoint saved to %s-%04d.params\n", prefix, iteration))
-    }
-    return(TRUE)
-      }
-    }
- ```
-
-The `mx.callback.log.train.metric` is a little more complex. It holds a reference class and updates it during the training
-process:
-
-
- ```r
-    mx.callback.log.train.metric <- function(period, logger=NULL) {
-      function(iteration, nbatch, env) {
-    if (nbatch %% period == 0 && !is.null(env$metric)) {
-      result <- env$metric$get(env$train.metric)
-      if (nbatch != 0)
-        cat(paste0("Batch [", nbatch, "] Train-", result$name, "=", result$value, "\n"))
-      if (!is.null(logger)) {
-        if (class(logger) != "mx.metric.logger") {
-          stop("Invalid mx.metric.logger.")
-        }
-        logger$train <- c(logger$train, result$value)
-        if (!is.null(env$eval.metric)) {
-          result <- env$metric$get(env$eval.metric)
-          if (nbatch != 0)
-            cat(paste0("Batch [", nbatch, "] Validation-", result$name, "=", result$value, "\n"))
-          logger$eval <- c(logger$eval, result$value)
-        }
-      }
-    }
-    return(TRUE)
-      }
-    }
- ```
-
-Now you might be curious why both callback functions `return(TRUE)`.
-
-Can we `return(FALSE)`?
-
-Yes! You can stop the training early with `return(FALSE)`. See the following examples.
-
-
-  ```r
-     mx.callback.early.stop <- function(eval.metric) {
-      function(iteration, nbatch, env) {
-    if (!is.null(env$metric)) {
-      if (!is.null(eval.metric)) {
-        result <- env$metric$get(env$eval.metric)
-        if (result$value < eval.metric) {
-          return(FALSE)
-        }
-      }
-    }
-    return(TRUE)
-      }
-    }
-    model <- mx.model.FeedForward.create(
-      lro, X=train.x, y=train.y,
-      eval.data=list(data=test.x, label=test.y),
-      ctx=mx.cpu(), num.round=10, array.batch.size=20,
-      learning.rate=2e-6, momentum=0.9, eval.metric=mx.metric.rmse,
-      epoch.end.callback = mx.callback.early.stop(10))
- ```
-
- ```
-    ## Auto detect layout of input matrix, use row major..
-    ## Start training with 1 devices
-    ## [1] Train-rmse=18.5897984387033
-    ## [1] Validation-rmse=13.5555213820571
-    ## [2] Train-rmse=12.5867564040256
-    ## [2] Validation-rmse=9.76304967080928
- ```
-
-When the validation metric dips below the threshold we set, the training process stops.
-
-## Next Steps
-* [Neural Networks with MXNet in Five Minutes](/api/r/docs/tutorials/five_minutes_neural_network)
-* [Classify Real-World Images with a Pretrained Model](/api/r/docs/tutorials/classify_real_image_with_pretrained_model)
-* [Handwritten Digits Classification Competition](/api/r/docs/tutorials/mnist_competition)
-* [Character Language Model Using RNN](/api/r/docs/tutorials/char_rnn_model)
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/custom_iterator.md b/docs/static_site/src/pages/api/r/docs/tutorials/custom_iterator.md
index bb4cfc2f5c1d..e0213387124e 100644
--- a/docs/static_site/src/pages/api/r/docs/tutorials/custom_iterator.md
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/custom_iterator.md
@@ -157,67 +157,6 @@ batch.size <- 100
 train.iter <- CustomCSVIter$new(iter = NULL, data.csv = "mnist_train.csv", data.shape = 28, batch.size = batch.size)
 ```
 
-CNN Model
-----------
-
-For this tutorial we are going to use the known LeNet architecture:
-
-```r
-lenet.model <- function(){
-  data <- mx.symbol.Variable('data')
-  conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=20) #first conv
-  tanh1 <- mx.symbol.Activation(data=conv1, act_type="tanh")
-  pool1 <- mx.symbol.Pooling(data=tanh1, pool_type="max", kernel=c(2,2), stride=c(2,2))
-  conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=50)# second conv
-  tanh2 <- mx.symbol.Activation(data=conv2, act_type="tanh")
-  pool2 <- mx.symbol.Pooling(data=tanh2, pool_type="max", kernel=c(2,2), stride=c(2,2))
-  flatten <- mx.symbol.Flatten(data=pool2)
-  fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=100) # first fullc
-  tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh")
-  fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10) # second fullc
-  network <- mx.symbol.SoftmaxOutput(data=fc2) # loss
-  network
-}
-network <- lenet.model()
-```
-
-Training with the Custom Iterator
-----------
-Finally, we can directly add the custom iterator as the training data source.
-
-```r
-model <- mx.model.FeedForward.create(symbol=network,
-                                     X=train.iter,
-                                     ctx=mx.gpu(0),
-                                     num.round=10,
-                                     array.batch.size=batch.size,
-                                     learning.rate=0.1,
-                                     momentum=0.9,
-                                     eval.metric=mx.metric.accuracy,
-                                     wd=0.00001,
-                                     batch.end.callback=mx.callback.log.speedometer(batch.size, frequency = 100)
-                                     )
-```
-
-The last 2 iterations with a K80 GPU look like this:
-
-```bash
-[8] Train-accuracy=0.998866666666667
-Batch [100] Speed: 15413.0104454713 samples/sec Train-accuracy=0.999
-Batch [200] Speed: 16629.3412459049 samples/sec Train-accuracy=0.99935
-Batch [300] Speed: 18412.6900509319 samples/sec Train-accuracy=0.9995
-Batch [400] Speed: 16757.2882328335 samples/sec Train-accuracy=0.999425
-Batch [500] Speed: 17116.6529207406 samples/sec Train-accuracy=0.99946
-Batch [600] Speed: 19627.589505195 samples/sec Train-accuracy=0.99945
-[9] Train-accuracy=0.9991
-Batch [100] Speed: 18971.5745536982 samples/sec Train-accuracy=0.9992
-Batch [200] Speed: 15554.8822435383 samples/sec Train-accuracy=0.99955
-Batch [300] Speed: 18327.6950115053 samples/sec Train-accuracy=0.9997
-Batch [400] Speed: 17103.0705411788 samples/sec Train-accuracy=0.9997
-Batch [500] Speed: 15104.8656902394 samples/sec Train-accuracy=0.99974
-Batch [600] Speed: 13818.7899518255 samples/sec Train-accuracy=0.99975
-[10] Train-accuracy=0.99975
-```
 
 Conclusion
 ----------
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/custom_loss_function.md b/docs/static_site/src/pages/api/r/docs/tutorials/custom_loss_function.md
deleted file mode 100644
index a4ca967d8e2c..000000000000
--- a/docs/static_site/src/pages/api/r/docs/tutorials/custom_loss_function.md
+++ /dev/null
@@ -1,231 +0,0 @@
----
-layout: page_api
-title: Custom Loss Function
-is_tutorial: true
-tag: r
-permalink: /api/r/docs/tutorials/custom_loss_function
----
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-
-Customized loss function
-======================================
-
-This tutorial provides guidelines for using a customized loss function in network construction.
-
-Model Training Example
-----------------------
-
-Let's begin with a small regression example. We can build and train a regression model with the following code:
-
-``` r
-data(BostonHousing, package = "mlbench")
-BostonHousing[, sapply(BostonHousing, is.factor)] <-
-  as.numeric(as.character(BostonHousing[, sapply(BostonHousing, is.factor)]))
-BostonHousing <- data.frame(scale(BostonHousing))
-
-test.ind = seq(1, 506, 5)    # 1 pt in 5 used for testing
-train.x = data.matrix(BostonHousing[-test.ind,-14])
-train.y = BostonHousing[-test.ind, 14]
-test.x = data.matrix(BostonHousing[--test.ind,-14])
-test.y = BostonHousing[--test.ind, 14]
-
-require(mxnet)
-```
-
-    ## Loading required package: mxnet
-
-``` r
-data <- mx.symbol.Variable("data")
-label <- mx.symbol.Variable("label")
-fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1")
-tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1")
-fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2")
-lro <- mx.symbol.LinearRegressionOutput(fc2, name = "lro")
-
-mx.set.seed(0)
-model <- mx.model.FeedForward.create(lro, X = train.x, y = train.y,
-                                     ctx = mx.cpu(),
-                                     num.round = 5,
-                                     array.batch.size = 60,
-                                     optimizer = "rmsprop",
-                                     verbose = TRUE,
-                                     array.layout = "rowmajor",
-                                     batch.end.callback = NULL,
-                                     epoch.end.callback = NULL)
-```
-
-    ## Start training with 1 devices
-
-``` r
-pred <- predict(model, test.x)
-```
-
-    ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
-
-``` r
-sum((test.y - pred[1,])^2) / length(test.y)
-```
-
-    ## [1] 0.2485236
-
-Besides the `LinearRegressionOutput`, we also provide `LogisticRegressionOutput` and `MAERegressionOutput`. However, this might not be enough for real-world models. You can provide your own loss function by using `mx.symbol.MakeLoss` when constructing the network.
-
-How to Use Your Own Loss Function
----------------------------------
-
-We still use our previous example, but this time we use `mx.symbol.MakeLoss` to minimize the `(pred-label)^2`
-
-``` r
-data <- mx.symbol.Variable("data")
-label <- mx.symbol.Variable("label")
-fc1 <- mx.symbol.FullyConnected(data, num_hidden = 14, name = "fc1")
-tanh1 <- mx.symbol.Activation(fc1, act_type = "tanh", name = "tanh1")
-fc2 <- mx.symbol.FullyConnected(tanh1, num_hidden = 1, name = "fc2")
-lro2 <- mx.symbol.MakeLoss(mx.symbol.square(mx.symbol.Reshape(fc2, shape = 0) - label), name="lro2")
-```
-
-Then we can train the network just as usual.
-
-``` r
-mx.set.seed(0)
-model2 <- mx.model.FeedForward.create(lro2, X = train.x, y = train.y,
-                                      ctx = mx.cpu(),
-                                      num.round = 5,
-                                      array.batch.size = 60,
-                                      optimizer = "rmsprop",
-                                      verbose = TRUE,
-                                      array.layout = "rowmajor",
-                                      batch.end.callback = NULL,
-                                      epoch.end.callback = NULL)
-```
-
-    ## Start training with 1 devices
-
-We should get very similar results because we are actually minimizing the same loss function. However, the result is quite different.
-
-``` r
-pred2 <- predict(model2, test.x)
-```
-
-    ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
-
-``` r
-sum((test.y - pred2)^2) / length(test.y)
-```
-
-    ## [1] 1.234584
-
-This is because the output of `mx.symbol.MakeLoss` is the gradient of the loss with respect to the input data. We can get the real prediction as below.
-
-``` r
-internals = internals(model2$symbol)
-fc_symbol = internals[[match("fc2_output", outputs(internals))]]
-
-model3 <- list(symbol = fc_symbol,
-               arg.params = model2$arg.params,
-               aux.params = model2$aux.params)
-
-class(model3) <- "MXFeedForwardModel"
-
-pred3 <- predict(model3, test.x)
-```
-
-    ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
-
-``` r
-sum((test.y - pred3[1,])^2) / length(test.y)
-```
-
-    ## [1] 0.248294
-
-We have provided many operations on the symbols. An example of `|pred-label|` can be found below.
-
-``` r
-lro_abs <- mx.symbol.MakeLoss(mx.symbol.abs(mx.symbol.Reshape(fc2, shape = 0) - label))
-mx.set.seed(0)
-model4 <- mx.model.FeedForward.create(lro_abs, X = train.x, y = train.y,
-                                      ctx = mx.cpu(),
-                                      num.round = 20,
-                                      array.batch.size = 60,
-                                      optimizer = "sgd",
-                                      learning.rate = 0.001,
-                                      verbose = TRUE,
-                                      array.layout = "rowmajor",
-                                      batch.end.callback = NULL,
-                                      epoch.end.callback = NULL)
-```
-
-    ## Start training with 1 devices
-
-``` r
-internals = internals(model4$symbol)
-fc_symbol = internals[[match("fc2_output", outputs(internals))]]
-
-model5 <- list(symbol = fc_symbol,
-               arg.params = model4$arg.params,
-               aux.params = model4$aux.params)
-
-class(model5) <- "MXFeedForwardModel"
-
-pred5 <- predict(model5, test.x)
-```
-
-    ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
-
-``` r
-sum(abs(test.y - pred5[1,])) / length(test.y)
-```
-
-    ## [1] 0.7056902
-
-``` r
-lro_mae <- mx.symbol.MAERegressionOutput(fc2, name = "lro")
-mx.set.seed(0)
-model6 <- mx.model.FeedForward.create(lro_mae, X = train.x, y = train.y,
-                                      ctx = mx.cpu(),
-                                      num.round = 20,
-                                      array.batch.size = 60,
-                                      optimizer = "sgd",
-                                      learning.rate = 0.001,
-                                      verbose = TRUE,
-                                      array.layout = "rowmajor",
-                                      batch.end.callback = NULL,
-                                      epoch.end.callback = NULL)
-```
-
-    ## Start training with 1 devices
-
-``` r
-pred6 <- predict(model6, test.x)
-```
-
-    ## Warning in mx.model.select.layout.predict(X, model): Auto detect layout of input matrix, use rowmajor..
-
-``` r
-sum(abs(test.y - pred6[1,])) / length(test.y)
-```
-
-    ## [1] 0.7056902
-
-
-## Next Steps
-* [Neural Networks with MXNet in Five Minutes](/api/r/docs/tutorials/five_minutes_neural_network)
-* [Classify Real-World Images with a PreTrained Model](/api/r/docs/tutorials/classify_real_image_with_pretrained_model)
-* [Handwritten Digits Classification Competition](/api/r/docs/tutorials/mnist_competition)
-* [Character Language Model Using RNN](/api/r/docs/tutorials/char_rnn_model)
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/five_minutes_neural_network.md b/docs/static_site/src/pages/api/r/docs/tutorials/five_minutes_neural_network.md
deleted file mode 100644
index f7121407892e..000000000000
--- a/docs/static_site/src/pages/api/r/docs/tutorials/five_minutes_neural_network.md
+++ /dev/null
@@ -1,341 +0,0 @@
----
-layout: page_api
-title: Five Minutes Neural Network
-is_tutorial: true
-tag: r
-permalink: /api/r/docs/tutorials/five_minutes_neural_network
----
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-Develop a Neural Network with MXNet in Five Minutes
-=============================================
-
-This tutorial is designed for new users of the `mxnet` package for R. It shows how to construct a neural network to do regression in 5 minutes. It shows how to perform classification and regression tasks, respectively. The data we use is in the `mlbench` package. Instructions to install R and MXNet's R package in different environments can be found [here](/get_started?version=master&platform=linux&language=r&environ=pip&processor=cpu).
-
-## Classification
-
- ```
-    ## Loading required package: mlbench
- ```
- ```r
-    if (!require(mlbench)) {
-      install.packages('mlbench')
-    }
- ```
-
- ```
-    ## Loading required package: mxnet
- ```
-
- ```r
-    require(mxnet)
- ```
-
- ```
-    ## Loading required datasets
- ```
-
- ```r
-    data(Sonar, package="mlbench")
-
-    Sonar[,61] = as.numeric(Sonar[,61])-1
-    train.ind = c(1:50, 100:150)
-    train.x = data.matrix(Sonar[train.ind, 1:60])
-    train.y = Sonar[train.ind, 61]
-    test.x = data.matrix(Sonar[-train.ind, 1:60])
-    test.y = Sonar[-train.ind, 61]
- ```
-
-We are going to use a multi-layer perceptron as our classifier. In `mxnet`, we have a function called `mx.mlp` for building a general multi-layer neural network to do classification or regression.
-
-`mx.mlp` requires the following parameters:
-
-- Training data and label
-- Number of hidden nodes in each hidden layer
-- Number of nodes in the output layer
-- Type of the activation
-- Type of the output loss
-- The device to train (GPU or CPU)
-- Other parameters for `mx.model.FeedForward.create`
-
-The following code shows an example usage of `mx.mlp`:
-
-
- ```r
-    mx.set.seed(0)
-    model <- mx.mlp(train.x, train.y, hidden_node=10, out_node=2, out_activation="softmax",
-                num.round=20, array.batch.size=15, learning.rate=0.07, momentum=0.9,
-                eval.metric=mx.metric.accuracy)
- ```
-
- ```
-    ## Auto detect layout of input matrix, use rowmajor..
-    ## Start training with 1 devices
-    ## [1] Train-accuracy=0.488888888888889
-    ## [2] Train-accuracy=0.514285714285714
-    ## [3] Train-accuracy=0.514285714285714
-    ## [4] Train-accuracy=0.514285714285714
-    ## [5] Train-accuracy=0.514285714285714
-    ## [6] Train-accuracy=0.523809523809524
-    ## [7] Train-accuracy=0.619047619047619
-    ## [8] Train-accuracy=0.695238095238095
-    ## [9] Train-accuracy=0.695238095238095
-    ## [10] Train-accuracy=0.761904761904762
-    ## [11] Train-accuracy=0.828571428571429
-    ## [12] Train-accuracy=0.771428571428571
-    ## [13] Train-accuracy=0.742857142857143
-    ## [14] Train-accuracy=0.733333333333333
-    ## [15] Train-accuracy=0.771428571428571
-    ## [16] Train-accuracy=0.847619047619048
-    ## [17] Train-accuracy=0.857142857142857
-    ## [18] Train-accuracy=0.838095238095238
-    ## [19] Train-accuracy=0.838095238095238
-    ## [20] Train-accuracy=0.838095238095238
- ```
-
-Note that `mx.set.seed` controls the random process in `mxnet`. You can see the accuracy in each round during training. It's also easy to make predictions and evaluate.
-
-To get an idea of what is happening, view the computation graph from R:
-
- ```r
-    graph.viz(model$symbol)
- ```
-
-[<img src="https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/knitr/graph.computation.png">](https://github.com/dmlc/mxnet)
-
- ```r
-    preds = predict(model, test.x)
- ```
-
- ```
-    ## Auto detect layout of input matrix, use rowmajor.
- ```
-
- ```r
-    pred.label = max.col(t(preds))-1
-    table(pred.label, test.y)
- ```
-
- ```
-    ##           test.y
-    ## pred.label  0  1
-    ##          0 24 14
-    ##          1 36 33
- ```
-
-Note for that for multi-class predictions, mxnet outputs `nclass` x `nexamples`, with each row corresponding to the probability of the class.
-
-## Regression
-
-Again, let us preprocess the data:
-
-
- ```r
-    data(BostonHousing, package="mlbench")
-
-    train.ind = seq(1, 506, 3)
-    train.x = data.matrix(BostonHousing[train.ind, -14])
-    train.y = BostonHousing[train.ind, 14]
-    test.x = data.matrix(BostonHousing[-train.ind, -14])
-    test.y = BostonHousing[-train.ind, 14]
- ```
-
-Although we can use `mx.mlp` again to do regression by changing the `out_activation`, this time we are going to introduce a flexible way to configure neural networks in `mxnet`. Configuration is done by the "Symbol" system in `mxnet`. The Symbol system takes care of the links among nodes, activation, dropout ratio, etc. Configure a multi-layer neural network as follows:
-
-
- ```r
-    # Define the input data
-    data <- mx.symbol.Variable("data")
-    # A fully connected hidden layer
-    # data: input source
-    # num_hidden: number of neurons in this hidden layer
-    fc1 <- mx.symbol.FullyConnected(data, num_hidden=1)
-
-    # Use linear regression for the output layer
-    lro <- mx.symbol.LinearRegressionOutput(fc1)
- ```
-
-What matters for a regression task is mainly the last function. It enables the new network to optimize for squared loss. Now let's train on this simple data set. In this configuration, we dropped the hidden layer so that the input layer is directly connected to the output layer.
-
-Next, make prediction with this structure and other parameters with `mx.model.FeedForward.create`:
-
-
- ```r
-    mx.set.seed(0)
-    model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
-                                         ctx=mx.cpu(),     num.round=50, array.batch.size=20,
-                                         learning.rate=2e-6, momentum=0.9,  eval.metric=mx.metric.rmse)
- ```
-
- ```
-    ## Auto detect layout of input matrix, use rowmajor.
-    ## Start training with 1 devices
-    ## [1] Train-rmse=16.063282524034
-    ## [2] Train-rmse=12.2792375712573
-    ## [3] Train-rmse=11.1984634005885
-    ## [4] Train-rmse=10.2645236892904
-    ## [5] Train-rmse=9.49711005504284
-    ## [6] Train-rmse=9.07733734175182
-    ## [7] Train-rmse=9.07884450847991
-    ## [8] Train-rmse=9.10463850277417
-    ## [9] Train-rmse=9.03977049028532
-    ## [10] Train-rmse=8.96870685004475
-    ## [11] Train-rmse=8.93113287361574
-    ## [12] Train-rmse=8.89937257821847
-    ## [13] Train-rmse=8.87182096922953
-    ## [14] Train-rmse=8.84476075083586
-    ## [15] Train-rmse=8.81464673014974
-    ## [16] Train-rmse=8.78672567900196
-    ## [17] Train-rmse=8.76265872846474
-    ## [18] Train-rmse=8.73946101419974
-    ## [19] Train-rmse=8.71651926303267
-    ## [20] Train-rmse=8.69457600919277
-    ## [21] Train-rmse=8.67354928674563
-    ## [22] Train-rmse=8.65328755392436
-    ## [23] Train-rmse=8.63378039680078
-    ## [24] Train-rmse=8.61488162586984
-    ## [25] Train-rmse=8.5965105183022
-    ## [26] Train-rmse=8.57868133563275
-    ## [27] Train-rmse=8.56135851937663
-    ## [28] Train-rmse=8.5444819772098
-    ## [29] Train-rmse=8.52802114610432
-    ## [30] Train-rmse=8.5119504512622
-    ## [31] Train-rmse=8.49624261719241
-    ## [32] Train-rmse=8.48087453238701
-    ## [33] Train-rmse=8.46582689119887
-    ## [34] Train-rmse=8.45107881002491
-    ## [35] Train-rmse=8.43661331401712
-    ## [36] Train-rmse=8.42241575909639
-    ## [37] Train-rmse=8.40847217331365
-    ## [38] Train-rmse=8.39476931796395
-    ## [39] Train-rmse=8.38129658373974
-    ## [40] Train-rmse=8.36804269059018
-    ## [41] Train-rmse=8.35499817678397
-    ## [42] Train-rmse=8.34215505742154
-    ## [43] Train-rmse=8.32950441908131
-    ## [44] Train-rmse=8.31703985777311
-    ## [45] Train-rmse=8.30475363906755
-    ## [46] Train-rmse=8.29264031506106
-    ## [47] Train-rmse=8.28069372820073
-    ## [48] Train-rmse=8.26890902770415
-    ## [49] Train-rmse=8.25728089053853
-    ## [50] Train-rmse=8.24580511500735
- ```
-
-It's also easy to make a prediction and evaluate it:
-
-
- ```r
-    preds = predict(model, test.x)
- ```
-
- ```
-    ## Auto detect layout of input matrix, use rowmajor..
- ```
-
- ```r
-    sqrt(mean((preds-test.y)^2))
- ```
-
- ```
-    ## [1] 7.800502
- ```
-
-Currently, we have four predefined metrics: "accuracy", "rmse", "mae", and "rmsle". MXNet provides the interface for defining your own metrics:
-
-
- ```r
-    demo.metric.mae <- mx.metric.custom("mae", function(label, pred) {
-      pred <- mx.nd.reshape(pred, shape = 0)
-      res <- mx.nd.mean(mx.nd.abs(label-pred))
-      return(res)
-    })
- ```
-
-This is an example of the mean absolute error metric. Simply plug it into the training function:
-
-
- ```r
-    mx.set.seed(0)
-    model <- mx.model.FeedForward.create(lro, X=train.x, y=train.y,
-                                         ctx=mx.cpu(),    num.round=50, array.batch.size=20,
-                                         learning.rate=2e-6, momentum=0.9, eval.metric=demo.metric.mae)
- ```
-
- ```
-    ## Auto detect layout of input matrix, use rowmajor.
-    ## Start training with 1 devices
-    ## [1] Train-mae=14.953625731998
-    ## [2] Train-mae=11.4802955521478
-    ## [3] Train-mae=8.50700579749213
-    ## [4] Train-mae=7.30591265360514
-    ## [5] Train-mae=7.38049803839789
-    ## [6] Train-mae=7.36036252975464
-    ## [7] Train-mae=7.06519222259521
-    ## [8] Train-mae=6.9962231847975
-    ## [9] Train-mae=6.96296903822157
-    ## [10] Train-mae=6.9046172036065
-    ## [11] Train-mae=6.87867620256212
-    ## [12] Train-mae=6.85872554779053
-    ## [13] Train-mae=6.81936407089233
-    ## [14] Train-mae=6.79135354359945
-    ## [15] Train-mae=6.77438741260105
-    ## [16] Train-mae=6.75365140702989
-    ## [17] Train-mae=6.73369296391805
-    ## [18] Train-mae=6.71600982877943
-    ## [19] Train-mae=6.69932826360067
-    ## [20] Train-mae=6.6852519777086
-    ## [21] Train-mae=6.67343420452542
-    ## [22] Train-mae=6.66315894656711
-    ## [23] Train-mae=6.65314838621351
-    ## [24] Train-mae=6.64388704299927
-    ## [25] Train-mae=6.63480265935262
-    ## [26] Train-mae=6.62583245171441
-    ## [27] Train-mae=6.61697626113892
-    ## [28] Train-mae=6.60842116673787
-    ## [29] Train-mae=6.60040124257406
-    ## [30] Train-mae=6.59264140658908
-    ## [31] Train-mae=6.58551020092434
-    ## [32] Train-mae=6.57864215638902
-    ## [33] Train-mae=6.57178926467896
-    ## [34] Train-mae=6.56495311525133
-    ## [35] Train-mae=6.55813185373942
-    ## [36] Train-mae=6.5513252152337
-    ## [37] Train-mae=6.54453214009603
-    ## [38] Train-mae=6.53775374094645
-    ## [39] Train-mae=6.53098879920112
-    ## [40] Train-mae=6.52423816257053
-    ## [41] Train-mae=6.51764053768582
-    ## [42] Train-mae=6.51121346155802
-    ## [43] Train-mae=6.5047902001275
-    ## [44] Train-mae=6.49837123023139
-    ## [45] Train-mae=6.49216641320123
-    ## [46] Train-mae=6.48598252402412
-    ## [47] Train-mae=6.4798010720147
-    ## [48] Train-mae=6.47362396452162
-    ## [49] Train-mae=6.46745183732775
-    ## [50] Train-mae=6.46128723356459
- ```
-
-Congratulations! You've learned the basics for using MXNet in R. To learn how to use MXNet's advanced features, see the other tutorials.
-
-
-## Next Steps
-* [Classify Real-World Images with Pre-trained Model](https://mxnet.io/tutorials/r/classifyRealImageWithPretrainedModel.html)
-* [Handwritten Digits Classification Competition](https://mxnet.io/tutorials/r/mnistCompetition.html)
-* [Character Language Model using RNN](https://mxnet.io/tutorials/r/charRnnModel.html)
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/mnist_competition.md b/docs/static_site/src/pages/api/r/docs/tutorials/mnist_competition.md
deleted file mode 100644
index e34819dd9ed7..000000000000
--- a/docs/static_site/src/pages/api/r/docs/tutorials/mnist_competition.md
+++ /dev/null
@@ -1,363 +0,0 @@
----
-layout: page_api
-title: MNIST Competition
-is_tutorial: true
-tag: r
-permalink: /api/r/docs/tutorials/mnist_competition
----
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-Handwritten Digits Classification Competition
-=============================================
-
-[MNIST](http://yann.lecun.com/exdb/mnist/) is a handwritten digits image data set created by Yann LeCun. Every digit is represented by a 28 x 28 pixel image. It's become a standard data set for testing classifiers on simple image input. A neural network is a strong model for image classification tasks. There's a [long-term hosted competition](https://www.kaggle.com/c/digit-recognizer) on Kaggle using this data set.
-This tutorial shows how to use [MXNet](https://github.com/dmlc/mxnet/tree/master/R-package) to compete in this challenge.
-
-## Loading the Data
-
-First, let's download the data from [Kaggle](https://www.kaggle.com/c/digit-recognizer/data) and put it in the `data/` folder in your working directory.
-
-Now we can read it in R and convert it to matrices:
-
-
- ```r
-    require(mxnet)
- ```
-
- ```
-    ## Loading required package: mxnet
-    ## Loading required package: methods
- ```
-
- ```r
-    train <- read.csv('data/train.csv', header=TRUE)
-    test <- read.csv('data/test.csv', header=TRUE)
-    train <- data.matrix(train)
-    test <- data.matrix(test)
-
-    train.x <- train[,-1]
-    train.y <- train[,1]
- ```
-
-Every image is represented as a single row in train/test. The greyscale of each image falls in the range [0, 255]. Linearly transform it into [0,1] by using the following command:
-
-
- ```r
-    train.x <- t(train.x/255)
-    test <- t(test/255)
- ```
-Transpose the input matrix to npixel x nexamples, which is the major format for columns accepted by MXNet (and the convention of R).
-
-In the label section, the number of each digit is fairly evenly distributed:
-
-
- ```r
-    table(train.y)
- ```
-
- ```
-    ## train.y
-    ##    0    1    2    3    4    5    6    7    8    9
-    ## 4132 4684 4177 4351 4072 3795 4137 4401 4063 4188
-```
-
-## Configuring the Network
-
-Now that we have the data, let's configure the structure of our network:
-
-
- ```r
-    data <- mx.symbol.Variable("data")
-    fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)
-    act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")
-    fc2 <- mx.symbol.FullyConnected(act1, name="fc2", num_hidden=64)
-    act2 <- mx.symbol.Activation(fc2, name="relu2", act_type="relu")
-    fc3 <- mx.symbol.FullyConnected(act2, name="fc3", num_hidden=10)
-    softmax <- mx.symbol.SoftmaxOutput(fc3, name="sm")
- ```
-
-1. In `mxnet`, we use the data type `symbol` to configure the network. `data <- mx.symbol.Variable("data")` uses `data` to represent the input data, i.e., the input layer.
-2. We set the first hidden layer with `fc1 <- mx.symbol.FullyConnected(data, name="fc1", num_hidden=128)`. This layer has `data` as the input, its name, and the number of hidden neurons.
-3. Activation is set with `act1 <- mx.symbol.Activation(fc1, name="relu1", act_type="relu")`. The activation function takes the output from the first hidden layer, `fc1`.
-4. The second hidden layer takes the result from `act1` as input, with its name as "fc2" and the number of hidden neurons as 64.
-5. The second activation is almost the same as `act1`, except we have a different input source and name.
-6. This generates the output layer. Because there are only 10 digits, we set the number of neurons to 10.
-7. Finally, we set the activation to softmax to get a probabilistic prediction.
-
-## Training
-
-We are almost ready for the training process. Before we start the computation, let's decide which device to use:
-
-
- ```r
-    devices <- mx.cpu()
- ```
-
-We assign CPU to `mxnet`. Now, you can run the following command to train the neural network! Note that `mx.set.seed` is the function that controls the random process in `mxnet`:
-
-
- ```r
-    mx.set.seed(0)
-    model <- mx.model.FeedForward.create(softmax, X=train.x, y=train.y,
-                                         ctx=devices, num.round=10, array.batch.size=100,
-                                         learning.rate=0.07, momentum=0.9,  eval.metric=mx.metric.accuracy,
-                                         initializer=mx.init.uniform(0.07),
-                                            epoch.end.callback=mx.callback.log.train.metric(100))
- ```
-
- ```
-    ## Start training with 1 devices
-    ## Batch [100] Train-accuracy=0.6563
-    ## Batch [200] Train-accuracy=0.777999999999999
-    ## Batch [300] Train-accuracy=0.827466666666665
-    ## Batch [400] Train-accuracy=0.855499999999999
-    ## [1] Train-accuracy=0.859832935560859
-    ## Batch [100] Train-accuracy=0.9529
-    ## Batch [200] Train-accuracy=0.953049999999999
-    ## Batch [300] Train-accuracy=0.955866666666666
-    ## Batch [400] Train-accuracy=0.957525000000001
-    ## [2] Train-accuracy=0.958309523809525
-    ## Batch [100] Train-accuracy=0.968
-    ## Batch [200] Train-accuracy=0.9677
-    ## Batch [300] Train-accuracy=0.9696
-    ## Batch [400] Train-accuracy=0.970650000000002
-    ## [3] Train-accuracy=0.970809523809526
-    ## Batch [100] Train-accuracy=0.973
-    ## Batch [200] Train-accuracy=0.974249999999999
-    ## Batch [300] Train-accuracy=0.976
-    ## Batch [400] Train-accuracy=0.977100000000003
-    ## [4] Train-accuracy=0.977452380952384
-    ## Batch [100] Train-accuracy=0.9834
-    ## Batch [200] Train-accuracy=0.981949999999999
-    ## Batch [300] Train-accuracy=0.981900000000001
-    ## Batch [400] Train-accuracy=0.982600000000003
-    ## [5] Train-accuracy=0.983000000000003
-    ## Batch [100] Train-accuracy=0.983399999999999
-    ## Batch [200] Train-accuracy=0.98405
-    ## Batch [300] Train-accuracy=0.985000000000001
-    ## Batch [400] Train-accuracy=0.985725000000003
-    ## [6] Train-accuracy=0.985952380952384
-    ## Batch [100] Train-accuracy=0.988999999999999
-    ## Batch [200] Train-accuracy=0.9876
-    ## Batch [300] Train-accuracy=0.988100000000001
-    ## Batch [400] Train-accuracy=0.988750000000003
-    ## [7] Train-accuracy=0.988880952380955
-    ## Batch [100] Train-accuracy=0.991999999999999
-    ## Batch [200] Train-accuracy=0.9912
-    ## Batch [300] Train-accuracy=0.990066666666668
-    ## Batch [400] Train-accuracy=0.990275000000003
-    ## [8] Train-accuracy=0.990452380952384
-    ## Batch [100] Train-accuracy=0.9937
-    ## Batch [200] Train-accuracy=0.99235
-    ## Batch [300] Train-accuracy=0.991966666666668
-    ## Batch [400] Train-accuracy=0.991425000000003
-    ## [9] Train-accuracy=0.991500000000003
-    ## Batch [100] Train-accuracy=0.9942
-    ## Batch [200] Train-accuracy=0.99245
-    ## Batch [300] Train-accuracy=0.992433333333334
-    ## Batch [400] Train-accuracy=0.992275000000002
-    ## [10] Train-accuracy=0.992380952380955
- ```
-
-## Making a Prediction and Submitting to the Competition
-
-To make a prediction, type:
-
-
- ```r
-    preds <- predict(model, test)
-    dim(preds)
- ```
-
- ```
-    ## [1]    10 28000
- ```
-
-It is a matrix with 28000 rows and 10 cols, containing the desired classification probabilities from the output layer. To extract the maximum label for each row, use `max.col`:
-
-
- ```r
-    pred.label <- max.col(t(preds)) - 1
-    table(pred.label)
- ```
-
- ```
-    ## pred.label
-    ##    0    1    2    3    4    5    6    7    8    9
-    ## 2818 3195 2744 2767 2683 2596 2798 2790 2784 2825
- ```
-
-With a little extra effort to modify the .csv format, our submission is ready for the competition!
-
-
- ```r
-    submission <- data.frame(ImageId=1:ncol(test), Label=pred.label)
-    write.csv(submission, file='submission.csv', row.names=FALSE,  quote=FALSE)
- ```
-
-## LeNet
-
-Now let's use a new network structure: [LeNet](http://yann.lecun.com/exdb/lenet/). It has been proposed by Yann LeCun for recognizing handwritten digits. We'll demonstrate how to construct and train a LeNet in `mxnet`.
-
-First, we construct the network:
-
-
-```r
-# input
-data <- mx.symbol.Variable('data')
-# first conv
-conv1 <- mx.symbol.Convolution(data=data, kernel=c(5,5), num_filter=20)
-tanh1 <- mx.symbol.Activation(data=conv1, act_type="tanh")
-pool1 <- mx.symbol.Pooling(data=tanh1, pool_type="max",
-                          kernel=c(2,2), stride=c(2,2))
-# second conv
-conv2 <- mx.symbol.Convolution(data=pool1, kernel=c(5,5), num_filter=50)
-tanh2 <- mx.symbol.Activation(data=conv2, act_type="tanh")
-pool2 <- mx.symbol.Pooling(data=tanh2, pool_type="max",
-                          kernel=c(2,2), stride=c(2,2))
-# first fullc
-flatten <- mx.symbol.Flatten(data=pool2)
-fc1 <- mx.symbol.FullyConnected(data=flatten, num_hidden=500)
-tanh3 <- mx.symbol.Activation(data=fc1, act_type="tanh")
-# second fullc
-fc2 <- mx.symbol.FullyConnected(data=tanh3, num_hidden=10)
-# loss
-lenet <- mx.symbol.SoftmaxOutput(data=fc2)
-```
-
-Then let's reshape the matrices into arrays:
-
-
-```r
-train.array <- train.x
-dim(train.array) <- c(28, 28, 1, ncol(train.x))
-test.array <- test
-dim(test.array) <- c(28, 28, 1, ncol(test))
-```
-
-We want to compare training speed on different devices, so define the devices:
-
-
-```r
-n.gpu <- 1
-device.cpu <- mx.cpu()
-device.gpu <- lapply(0:(n.gpu-1), function(i) {
-  mx.gpu(i)
-})
-```
-
-We can pass a list of devices to ask MXNet to train on multiple GPUs (you can do this for CPUs,
-but because internal computation of CPUs is already multi-threaded, there is less gain than with using GPUs).
-
-Start by training on the CPU first. Because this takes a bit time, we run it for just one iteration.
-
-
- ```r
-    mx.set.seed(0)
-    tic <- proc.time()
-    model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y,
-                                     ctx=device.cpu, num.round=1, array.batch.size=100,
-                                     learning.rate=0.05, momentum=0.9, wd=0.00001,
-                                     eval.metric=mx.metric.accuracy,
-                                       epoch.end.callback=mx.callback.log.train.metric(100))
- ```
-
- ```
-    ## Start training with 1 devices
-    ## Batch [100] Train-accuracy=0.1066
-    ## Batch [200] Train-accuracy=0.16495
-    ## Batch [300] Train-accuracy=0.401766666666667
-    ## Batch [400] Train-accuracy=0.537675
-    ## [1] Train-accuracy=0.557136038186157
- ```
-
- ```r
-    print(proc.time() - tic)
- ```
-
- ```
-    ##    user  system elapsed
-    ## 130.030 204.976  83.821
- ```
-
-Train on a GPU:
-
-
- ```r
-    mx.set.seed(0)
-    tic <- proc.time()
-    model <- mx.model.FeedForward.create(lenet, X=train.array, y=train.y,
-                                     ctx=device.gpu, num.round=5, array.batch.size=100,
-                                     learning.rate=0.05, momentum=0.9, wd=0.00001,
-                                     eval.metric=mx.metric.accuracy,
-                                       epoch.end.callback=mx.callback.log.train.metric(100))
- ```
-
- ```
-    ## Start training with 1 devices
-    ## Batch [100] Train-accuracy=0.1066
-    ## Batch [200] Train-accuracy=0.1596
-    ## Batch [300] Train-accuracy=0.3983
-    ## Batch [400] Train-accuracy=0.533975
-    ## [1] Train-accuracy=0.553532219570405
-    ## Batch [100] Train-accuracy=0.958
-    ## Batch [200] Train-accuracy=0.96155
-    ## Batch [300] Train-accuracy=0.966100000000001
-    ## Batch [400] Train-accuracy=0.968550000000003
-    ## [2] Train-accuracy=0.969071428571432
-    ## Batch [100] Train-accuracy=0.977
-    ## Batch [200] Train-accuracy=0.97715
-    ## Batch [300] Train-accuracy=0.979566666666668
-    ## Batch [400] Train-accuracy=0.980900000000003
-    ## [3] Train-accuracy=0.981309523809527
-    ## Batch [100] Train-accuracy=0.9853
-    ## Batch [200] Train-accuracy=0.985899999999999
-    ## Batch [300] Train-accuracy=0.986966666666668
-    ## Batch [400] Train-accuracy=0.988150000000002
-    ## [4] Train-accuracy=0.988452380952384
-    ## Batch [100] Train-accuracy=0.990199999999999
-    ## Batch [200] Train-accuracy=0.98995
-    ## Batch [300] Train-accuracy=0.990600000000001
-    ## Batch [400] Train-accuracy=0.991325000000002
-    ## [5] Train-accuracy=0.991523809523812
- ```
-
- ```r
-    print(proc.time() - tic)
- ```
-
- ```
-    ##    user  system elapsed
-    ##   9.288   1.680   6.889
- ```
-
-By using a GPU processor, we significantly speed up training!
-Now, we can submit the result to Kaggle to see the improvement of our ranking!
-
-
- ```r
-    preds <- predict(model, test.array)
-    pred.label <- max.col(t(preds)) - 1
-    submission <- data.frame(ImageId=1:ncol(test), Label=pred.label)
-    write.csv(submission, file='submission.csv', row.names=FALSE, quote=FALSE)
- ```
-
-![](https://raw.githubusercontent.com/dmlc/web-data/master/mxnet/knitr/mnistCompetition-kaggle-submission.png)
-
-##  Next Steps
-* [Character Language Model using RNN](https://mxnet.io/tutorials/r/charRnnModel.html)
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/ndarray.md b/docs/static_site/src/pages/api/r/docs/tutorials/ndarray.md
index dc3d1c5a028e..115203201977 100644
--- a/docs/static_site/src/pages/api/r/docs/tutorials/ndarray.md
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/ndarray.md
@@ -224,8 +224,5 @@ the results.
 
 ## Next Steps
 * [Symbol](/api/r/docs/tutorials/symbol)
-* [Write and use callback functions](/api/r/docs/tutorials/callback_function)
-* [Neural Networks with MXNet in Five Minutes](/api/r/docs/tutorials/five_minutes_neural_network)
 * [Classify Real-World Images with Pre-trained Model](/api/r/docs/tutorials/classify_real_image_with_pretrained_model)
-* [Handwritten Digits Classification Competition](/api/r/docs/tutorials/mnist_competition)
 * [Character Language Model using RNN](/api/r/docs/tutorials/char_rnn_model)
diff --git a/docs/static_site/src/pages/api/r/docs/tutorials/symbol.md b/docs/static_site/src/pages/api/r/docs/tutorials/symbol.md
index 5827d6f9b50c..a02cc70bab72 100644
--- a/docs/static_site/src/pages/api/r/docs/tutorials/symbol.md
+++ b/docs/static_site/src/pages/api/r/docs/tutorials/symbol.md
@@ -30,18 +30,8 @@ The computational unit `NDArray` requires a way to construct neural networks. MX
 
 The following code creates a two-layer perceptron network:
 
-
-```r
-require(mxnet)
-net <- mx.symbol.Variable("data")
-net <- mx.symbol.FullyConnected(data=net, name="fc1", num_hidden=128)
-net <- mx.symbol.Activation(data=net, name="relu1", act_type="relu")
-net <- mx.symbol.FullyConnected(data=net, name="fc2", num_hidden=64)
-net <- mx.symbol.Softmax(data=net, name="out")
-class(net)
-```
-
 ```
+require(mxnet)
 ## [1] "Rcpp_MXSymbol"
 ## attr(,"package")
 ## [1] "mxnet"
@@ -52,28 +42,7 @@ or free variables. Other symbols take a symbol as the input (*data*),
 and may accept other hyper parameters, such as the number of hidden neurons (*num_hidden*)
 or the activation type (*act_type*).
 
-A symbol can be viewed as a function that takes several arguments, whose
-names are automatically generated and can be retrieved with the following command:
-
-
-```r
-arguments(net)
-```
-
-```
-## [1] "data"       "fc1_weight" "fc1_bias"   "fc2_weight" "fc2_bias"
-## [6] "out_label"
-```
-
-The arguments are the parameters need by each symbol:
-
-- *data*: Input data needed by the variable *data*
-- *fc1_weight* and *fc1_bias*: The weight and bias for the first fully connected layer, *fc1*
-- *fc2_weight* and *fc2_bias*: The weight and bias for the second fully connected layer, *fc2*
-- *out_label*: The label needed by the loss
-
-We can also specify the automatically generated names explicitly:
-
+We can also specify the names explicitly:
 
 ```r
 data <- mx.symbol.Variable("data")
@@ -147,8 +116,5 @@ be more memory efficient than CXXNet and gets to the same runtime with
 greater flexibility.
 
 ## Next Steps
-* [Write and use callback functions](/api/r/docs/tutorials/callback_function)
-* [Neural Networks with MXNet in Five Minutes](/api/r/docs/tutorials/five_minutes_neural_network)
 * [Classify Real-World Images with Pre-trained Model](/api/r/docs/tutorials/classify_real_image_with_pretrained_model)
-* [Handwritten Digits Classification Competition](/api/r/docs/tutorials/mnist_competition)
 * [Character Language Model using RNN](/api/r/docs/tutorials/char_rnn_model)
diff --git a/docs/static_site/src/pages/api/scala/docs/tutorials/char_lstm.md b/docs/static_site/src/pages/api/scala/docs/tutorials/char_lstm.md
deleted file mode 100644
index bb23fd00a18d..000000000000
--- a/docs/static_site/src/pages/api/scala/docs/tutorials/char_lstm.md
+++ /dev/null
@@ -1,530 +0,0 @@
----
-layout: page_api
-title: Char-LSTM
-is_tutorial: true
-tag: scala
-permalink: /api/scala/docs/tutorials/char_lstm
----
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# Developing a Character-level Language model
-
-This tutorial shows how to train a character-level language model with a multilayer recurrent neural network (RNN) using Scala. This model takes one text file as input and trains an RNN that learns to predict the next character in the sequence. In this tutorial, you train a multilayer LSTM (Long Short-Term Memory) network that generates relevant text using Barack Obama's speech patterns.
-
-There are many documents that explain LSTM concepts. If you aren't familiar with LSTM, refer to the following before you proceed:
-- Christopher Olah's [Understanding LSTM blog post](https://colah.github.io/posts/2015-08-Understanding-LSTMs/)
-- [Training a LSTM char-rnn in Julia to Generate Random Sentences](http://dmlc.ml/mxnet/2015/11/15/char-lstm-in-julia.html)
-- [Bucketing in MXNet in Python](https://github.com/dmlc/mxnet-notebooks/blob/master/python/tutorials/char_lstm.ipynb)
-- [Bucketing in MXNet](https://mxnet.io/faq/bucketing.html)
-
-## How to Use This Tutorial
-
-There are three ways to use this tutorial:
-
-1) Run it by copying the provided code snippets and pasting them into the Scala command line, making the appropriate changes to the input file path.
-
-2) Reuse the code by making changes to relevant parameters and running it from command line.
-
-3) [Run the source code directly](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn) by running the [provided scripts](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/scripts/rnn).
-
-To run the scripts:
-- Build and train the model with the [run_train_charrnn.sh script](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/scripts/rnn/run_train_charrnn.sh). Edit the script as follows:
-
-Edit the CLASS_PATH variable in the script to include your operating system-specific folder (e.g., linux-x86_64-cpu/linux-x86_64-gpu/osx-x86_64-cpu) in the path. Run the script with the following command:
-
-```bash
-
-    bash run_train_charrnn.sh <which GPU card to use; -1 means CPU> <input data path> <location to save the model>
-
-    e.g.,
-    bash run_train_charrnn.sh -1 ./datas/obama.txt ./models/obama
-
-```
-
-- Run inference with the [run_test_charrnn.sh script](https://github.com/dmlc/mxnet/blob/master/scala-package/examples/scripts/rnn/run_test_charrnn.sh). Edit the script as follows:
-
-Edit the CLASS_PATH variable in the script to include your operating system-specific folder (e.g., linux-x86_64-cpu/linux-x86_64-gpu/osx-x86_64-cpu) in the path. Run the script with the following command:
-
-```bash
-
-    bash run_test_charrnn.sh <input data path> <trained model from previous script>
-
-    e.g.,
-    bash run_test_charrnn.sh ./datas/obama.txt ./models/obama
-```
-
-In this tutorial, you will accomplish the following:
-
--	Build an LSTM network that learns speech patterns from Barack Obama's speeches at the character level. At each time interval, the input is a character.
--	Clean up the dataset.
--	Train a model.
--	Fit the model.
--	Build the inference model.
-
-## Prerequisites
-
-To complete this tutorial, set up and run the Scala interpreter by following the [instructions]({{'/get_started/scala_setup#interpreter'|relative_url}}).
-
-## Download the Data
-
-First, download the data, which contains Barack Obama's speeches. The data is stored in a file called obama.txt and is available on [mxnet.io](http://data.mxnet.io/data/char_lstm.zip)
-
-To download the data which contains Barack Obama's speeches:
-
-1) Download the dataset with the following command:
-
-    ```bash
-        wget http://data.mxnet.io/data/char_lstm.zip
-    ```
-
-2) Unzip the dataset with the following command:
-
-    ```bash
-        unzip char_lstm.zip -d char_lstm/
-    ```
-
-3) The downloaded data contains President Obama's speeches. You can take a sneak peek at the dataset with the following command:
-
-    ```bash
-        head -10 obama.txt
-    ```
-
-Output:
-```
-        Call to Renewal Keynote Address Call to Renewal Pt 1Call to Renewal Part 2 TOPIC: Our Past, Our Future & Vision for America June
-        28, 2006 Call to Renewal' Keynote Address Complete Text Good morning. I appreciate the opportunity to speak here at the Call to R
-        enewal's Building a Covenant for a New America conference. I've had the opportunity to take a look at your Covenant for a New Ame
-        rica. It is filled with outstanding policies and prescriptions for much of what ails this country. So I'd like to congratulate yo
-        u all on the thoughtful presentations you've given so far about poverty and justice in America, and for putting fire under the fe
-        et of the political leadership here in Washington.But today I'd like to talk about the connection between religion and politics a
-        nd perhaps offer some thoughts about how we can sort through some of the often bitter arguments that we've been seeing over the l
-        ast several years.I do so because, as you all know, we can affirm the importance of poverty in the Bible; and we can raise up and
-         pass out this Covenant for a New America. We can talk to the press, and we can discuss the religious call to address poverty and
-         environmental stewardship all we want, but it won't have an impact unless we tackle head-on the mutual suspicion that sometimes
-```
-
-## Prepare the Data
-
-To preprocess the dataset, define the following utility functions:
-
-* `readContent` - Reads data from the data file.
-* `buildVocab` - Maps each character to a unique Integer ID, i.e., builds a vocabulary.
-* `text2Id` - Encodes each sentence with an Integer ID.
-
-Then, use these utility functions to generate vocabulary from the input text file (obama.txt).
-
-To prepare the data:
-
-1) Read the dataset with the following function:
-
-```scala
-scala> import scala.io.Source
-
-import scala.io.Source
-
-scala> // Read file
-scala> def readContent(path: String): String = Source.fromFile(path).mkString
-
-readContent: (path: String)String
-```
-
-2) Build a vocabulary with the following function:
-
-```scala
-scala> // Build  a vocabulary of what char we have in the content
-scala> def buildVocab(path: String): Map[String, Int] = {
-        val content = readContent(path).split("\n")
-        var idx = 1 // 0 is left for zero padding
-        var theVocab = Map[String, Int]()
-        for (line <- content) {
-         for (char <- line) {
-           val key = s"$char"
-           if (!theVocab.contains(key)) {
-             theVocab = theVocab + (key -> idx)
-             idx += 1
-           }
-         }
-        }
-        theVocab
-       }
-
-       buildVocab: (path: String)Map[String,Int]
-```
-
-3) To assign each character a unique numerical ID, use the following function:
-
-```scala
-scala> def text2Id(sentence: String, theVocab: Map[String, Int]): Array[Int] = {
-        val words = for (char <- sentence) yield theVocab(s"$char")
-        words.toArray
-      }
-
-      text2Id: (sentence: String, theVocab: Map[String,Int])Array[Int]
-```
-
-4) Now, build a character vocabulary from the dataset (obama.txt). Change the input filepath (dataPath) to reflect your settings.   
-
-```scala
-scala> // Give your system path to the "obama.txt" we have downloaded using previous steps.
-scala> val dataPath = "obama.txt"
-dataPath: String = obama.txt
-
-scala> val vocab = buildVocab(dataPath)
-
-scala> vocab.size
-res23: Int = 82
-```
-
-
-## Build a Multi-layer LSTM model
-
-Now, create a multi-layer LSTM model.
-
-To create the model:
-
-1) Load the helper files (`Lstm.scala`, `BucketIo.scala` and `RnnModel.scala`).
-`Lstm.scala` contains the definition of the LSTM cell. `BucketIo.scala` creates a sentence iterator. `RnnModel.scala` is used for model inference. The helper files are available on the [MXNet site](https://github.com/apache/incubator-mxnet/tree/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn).
-To load them, at the Scala command prompt type:
-
-```scala
-scala> :load ../../../scala-package/examples/src/main/scala/org/apache/mxnet/examples/rnn/Lstm.scala
-scala> :load ../../../scala-package/examples/src/main/scala/org/apache/mxnet/examples/rnn/BucketIo.scala
-scala> :load ../../../scala-package/examples/src/main/scala/org/apache/mxnet/examples/rnn/RnnModel.scala
-```
-
-2) Set the LSTM hyperparameters as follows:
-
-```scala
-scala> // We can support various input lengths.
-scala> // For this problem, we cut each input sentence to a length of 129 characters.
-scala> // So we only need a fixed length bucket length.
-scala> val buckets = Array(129)
-buckets: Array[Int] = Array(129)
-
-scala> // hidden unit in LSTM cell
-scala> val numHidden = 512
-numHidden: Int = 512
-
-scala> // The embedding dimension, which maps a char to a 256 dim vector
-scala> val numEmbed = 256
-numEmbed: Int = 256
-
-scala> // The number of lstm layers
-scala> val numLstmLayer = 3
-numLstmLayer: Int = 3
-
-scala> // The batch size for training
-scala> val batchSize = 32
-batchSize: Int = 32
-```
-
-3) Now, construct the LSTM network as a symbolic computation graph. Type the following to create a graph in which the model is unrolled for a fixed length explicitly in time.
-
-```scala
-scala> // generate symbol for a length
-scala> def symGen(seqLen: Int): Symbol = {
-    Lstm.lstmUnroll(numLstmLayer, seqLen, vocab.size + 1,
-                numHidden = numHidden, numEmbed = numEmbed,
-                numLabel = vocab.size + 1, dropout = 0.2f)
-  }
-symGen: (seqLen: Int)org.apache.mxnet.Symbol
-
-scala> // create the network symbol
-scala> val symbol = symGen(buckets(0))
-symbol: org.apache.mxnet.Symbol = org.apache.mxnet.Symbol@3a589eed
-
-```      
-
-4) To train the model, initialize states for the LSTM and create a data iterator, which groups the data into buckets.
-Note: The BucketSentenceIter data iterator supports various length examples; however, we use only the fixed length version in this tutorial.
-
-```scala
-
-scala> // initialize states for LSTM
-scala> val initC = for (l <- 0 until numLstmLayer) yield (s"l${l}_init_c", (batchSize, numHidden))
-
-initC: scala.collection.immutable.IndexedSeq[(String, (Int, Int))] = Vector((l0_init_c,(32,512)),
-(l1_init_c,(32,512)), (l2_init_c,(32,512)))
-
-scala> val initH = for (l <- 0 until numLstmLayer) yield (s"l${l}_init_h", (batchSize, numHidden))
-
-initH: scala.collection.immutable.IndexedSeq[(String, (Int, Int))] = Vector((l0_init_h,(32,512)),
-(l1_init_h,(32,512)), (l2_init_h,(32,512)))
-
-scala> val initStates = initC ++ initH
-
-initStates: scala.collection.immutable.IndexedSeq[(String, (Int, Int))] =
-Vector((l0_init_c,(32,512)), (l1_init_c,(32,512)), (l2_init_c,(32,512)), (l0_init_h,(32,512)),
-(l1_init_h,(32,512)), (l2_init_h,(32,512)))
-
-scala> val dataTrain = new BucketIo.BucketSentenceIter(dataPath, vocab, buckets,
-                                      batchSize, initStates, seperateChar = "\n",
-                                      text2Id = text2Id, readContent = readContent)
-
-dataTrain: BucketIo.BucketSentenceIter = non-empty iterator
-
-```
-
-5) You can set more than 100 epochs, but for this tutorial, specify 75 epochs. Each epoch can take as long as 4 minutes on a GPU. In this tutorial, you will use the [ADAM optimizer]({{'/api/scala/docs/api/#org.apache.mxnet.optimizer.Adam'|relative_url}}):
-```scala
-scala> import org.apache.mxnet._
-import org.apache.mxnet._
-
-scala> import org.apache.mxnet.Callback.Speedometer
-import org.apache.mxnet.Callback.Speedometer
-
-scala> import org.apache.mxnet.optimizer.Adam
-import org.apache.mxnet.optimizer.Adam
-
-scala> // and we will see result by training 75 epochs
-scala> val numEpoch = 75
-numEpoch: Int = 75
-
-scala> // learning rate
-scala> val learningRate = 0.001f
-learningRate: Float = 0.001
-
-```
-
-6) Define the perplexity utility function for the evaluation metric which is used to calculate the negative log-likelihood during training.
-
-```scala
-scala> def perplexity(label: NDArray, pred: NDArray): Float = {
-        val shape = label.shape
-        val size = shape(0) * shape(1)
-        val labelT = {
-          val tmp = label.toArray.grouped(shape(1)).toArray
-          val result = Array.fill[Float](size)(0f)
-          var idx = 0
-          for (i <- 0 until shape(1)) {
-            for (j <- 0 until shape(0)) {
-              result(idx) = tmp(j)(i)
-              idx += 1
-            }
-          }
-          result
-        }
-        var loss = 0f
-        val predArray = pred.toArray.grouped(pred.shape(1)).toArray
-        for (i <- 0 until pred.shape(0)) {
-          loss += -Math.log(Math.max(1e-10, predArray(i)(labelT(i).toInt)).toFloat).toFloat
-        }
-        loss / size
-        }
-
-perplexity: (label: org.apache.mxnet.NDArray, pred: org.apache.mxnet.NDArray)Float
-
-scala> def doCheckpoint(prefix: String): EpochEndCallback = new EpochEndCallback {
-            override def invoke(epoch: Int, symbol: Symbol,
-                                argParams: Map[String, NDArray],
-                                auxStates: Map[String, NDArray]): Unit = {
-              Model.saveCheckpoint(prefix, epoch + 1, symbol, argParams, auxStates)
-            }
-        }
-
-doCheckpoint: (prefix: String)org.apache.mxnet.EpochEndCallback
-
-```
-
-7) Define the initializer that is required for creating a model, as follows:
-
-```scala
-scala> val initializer = new Xavier(factorType = "in", magnitude = 2.34f)
-
-initializer: org.apache.mxnet.Xavier = org.apache.mxnet.Xavier@54e8f10a
-
-```
-
-8) Now, you have implemented all the supporting infrastructure for the char-lstm model. To train the model, use the standard [MXNet high-level API]({{'/api/scala/docs/api/#org.apache.mxnet.FeedForward'|relative_url}}). You can train the model on a single GPU or CPU instead of multiple GPUs or CPUs by changing ```scala .setContext(Array(Context.gpu(0),Context.gpu(1),Context.gpu(2),Context.gpu(3)))``` to ```scala .setContext(Array(Context.gpu(0)))```:
-```scala
-scala> val model = FeedForward.newBuilder(symbol)
-        .setContext(Array(Context.gpu(0),Context.gpu(1),Context.gpu(2),Context.gpu(3)))
-        .setNumEpoch(numEpoch)
-        .setOptimizer(new Adam(learningRate = learningRate, wd = 0.00001f))
-        .setInitializer(initializer)
-        .setTrainData(dataTrain)
-        .setEvalMetric(new CustomMetric(perplexity, name = "perplexity"))
-        .setBatchEndCallback(new Speedometer(batchSize, 20))
-        .setEpochEndCallback(doCheckpoint("obama"))
-        .build()
-
-model: org.apache.mxnet.FeedForward = org.apache.mxnet.FeedForward@4926f6c7
-```
-
-Now, you have an LSTM model and you've trained it. Use this model to create the inference.
-
-## Build the Inference Model
-
-You can now sample sentences from the trained model. The sampler works as follows:
-- Takes some fixed character set (e.g., "The United States") and feeds it into the LSTM as the starting input.
-- In the first time step, the LSTM produces an output distribution over the vocabulary and a state; the sampler then samples a character from the output distribution and fixes it as the second character.
-- In the next time step, feeds the previously sampled character as input.
-- Continues running until it has sampled enough characters. Note we are running mini-batches, so several sentences could be sampled simultaneously.
-
-To build the inference model, define the following utility functions that help MXNet make inferences:
-
-* `makeRevertVocab` - Reverts the key value in the dictionary for easy access to characters while predicting
-* `makeInput` -  Uses a given character as input
-* `cdf`, `choice` - `cdf` is a helper function for the `choice` function, which is used to create random samples
-* `makeOutput` - Directs the model to use either random output or fixed output by choosing the option with the greatest probability.
-
-```scala
-scala> import scala.util.Random
-
-scala> // helper structure for prediction
-scala> def makeRevertVocab(vocab: Map[String, Int]): Map[Int, String] = {
-          var dic = Map[Int, String]()
-          vocab.foreach { case (k, v) =>
-            dic = dic + (v -> k)
-          }
-          dic
-        }
-
-makeRevertVocab: (vocab: Map[String,Int])Map[Int,String]
-
-scala> // make input from char
-scala> def makeInput(char: Char, vocab: Map[String, Int], arr: NDArray): Unit = {
-      val idx = vocab(s"$char")
-      val tmp = NDArray.zeros(1)
-      tmp.set(idx)
-      arr.set(tmp)
-    }
-
-makeInput: (char: Char, vocab: Map[String,Int], arr: org.apache.mxnet.NDArray)Unit
-
-scala> // helper function for random sample
-scala> def cdf(weights: Array[Float]): Array[Float] = {
-        val total = weights.sum
-        var result = Array[Float]()
-        var cumsum = 0f
-        for (w <- weights) {
-          cumsum += w
-          result = result :+ (cumsum / total)
-        }
-        result
-      }
-
-cdf: (weights: Array[Float])Array[Float]
-
-scala> def choice(population: Array[String], weights: Array[Float]): String = {
-      assert(population.length == weights.length)
-      val cdfVals = cdf(weights)
-      val x = Random.nextFloat()
-      var idx = 0
-      var found = false
-      for (i <- 0 until cdfVals.length) {
-        if (cdfVals(i) >= x && !found) {
-          idx = i
-          found = true
-        }
-      }
-      population(idx)
-    }
-
-choice: (population: Array[String], weights: Array[Float])String
-
-scala> // we can use random output or fixed output by choosing largest probability
-scala> def makeOutput(prob: Array[Float], vocab: Map[Int, String],
-                      sample: Boolean = false, temperature: Float = 1f): String = {
-         var idx = -1
-         val char = if (sample == false) {
-           idx = ((-1f, -1) /: prob.zipWithIndex) { (max, elem) =>
-             if (max._1 < elem._1) elem else max
-           }._2
-           if (vocab.contains(idx)) vocab(idx)
-           else ""
-         } else {
-           val fixDict = Array("") ++ (1 until vocab.size + 1).map(i => vocab(i))
-           var scaleProb = prob.map(x => if (x < 1e-6) 1e-6 else if (x > 1 - 1e-6) 1 - 1e-6 else x)
-           var rescale = scaleProb.map(x => Math.exp(Math.log(x) / temperature).toFloat)
-           val sum = rescale.sum.toFloat
-           rescale = rescale.map(_ / sum)
-           choice(fixDict, rescale)
-         }
-         char
-       }
-
-makeOutput: (prob: Array[Float], vocab: Map[Int,String], sample: Boolean, temperature: Float)String
-
-```
-
-1) Build the inference model:
-
-```scala
-scala> // load from check-point
-scala> val (_, argParams, _) = Model.loadCheckpoint("obama", 75)
-
-scala> // build an inference model
-scala> val model = new RnnModel.LSTMInferenceModel(numLstmLayer, vocab.size + 1, \
-                           numHidden = numHidden, numEmbed = numEmbed, \
-                           numLabel = vocab.size + 1, argParams = argParams, \
-                           ctx = Context.cpu(), dropout = 0.2f)
-
-model: RnnModel.LSTMInferenceModel = RnnModel$LSTMInferenceModel@2f0c0319
-```
-
-2) Now you can generate a sequence of 1200 characters (you can select any number of characters you want) starting with "The United States" as follows:
-
-```scala
-
-scala> val seqLength = 1200
-seqLength: Int = 1200
-
-scala> val inputNdarray = NDArray.zeros(1)
-inputNdarray: org.apache.mxnet.NDArray = org.apache.mxnet.NDArray@9c231a24
-
-scala> val revertVocab = makeRevertVocab(vocab)
-
-scala> // Feel free to change the starter sentence
-
-scala> var output = "The United States"
-output: String = The United States
-
-scala> val randomSample = true
-randomSample: Boolean = true
-
-scala> var newSentence = true
-newSentence: Boolean = true
-
-scala> val ignoreLength = output.length()
-ignoreLength: Int = 17
-
-scala> for (i <- 0 until seqLength) {
-        if (i <= ignoreLength - 1) makeInput(output(i), vocab, inputNdarray)
-        else makeInput(output.takeRight(1)(0), vocab, inputNdarray)
-        val prob = model.forward(inputNdarray, newSentence)
-        newSentence = false
-        val nextChar = makeOutput(prob, revertVocab, randomSample)
-        if (nextChar == "") newSentence = true
-        if (i >= ignoreLength) output = output ++ nextChar
-      }
-
-scala> output
-
-res7: String = The United States who have been blessed no companies would be proud that the challenges we face, it's not as directly untelle are in my daughters - you can afford -- life-saving march care and poor information and receiving battle against other speeces and lead its people. After champions of 2006, and because Africa in America, separate has been conferenced by children ation of discrimination, we remember all of this, succeeded in any other feelings of a palently better political process - at lliims being disability payment. All across all different mights of a more just a few global personal morality and industrialized ready to succeed.One can afford when the earliest days of a pension you can add to the system be confructive despair. They have starting in the demand for...
-
-```
-
-
-You can see the output generated from Obama's speeches. All of the line breaks, punctuation, and uppercase and lowercase letters were produced by the sampler (no post-processing was performed).
-
-
-## Next Steps
-* [Scala API]({{'/api/scala'|relative_url}})
-* [More Scala Examples](https://github.com/dmlc/mxnet/tree/master/scala-package/examples/)
-* [MXNet tutorials index]({{'/api'|relative_url}})
diff --git a/docs/static_site/src/pages/api/scala/docs/tutorials/mnist.md b/docs/static_site/src/pages/api/scala/docs/tutorials/mnist.md
deleted file mode 100644
index d121b41a6ef4..000000000000
--- a/docs/static_site/src/pages/api/scala/docs/tutorials/mnist.md
+++ /dev/null
@@ -1,141 +0,0 @@
----
-layout: page_api
-title: MNIST Example
-is_tutorial: true
-tag: scala
-permalink: /api/scala/docs/tutorials/mnist
----
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# Handwritten Digit Recognition
-
-This Scala tutorial guides you through a classic computer vision application: identifying handwritten digits.
-
-Let's train a 3-layer network (i.e., a multilayer perceptron network) on the MNIST dataset to classify handwritten digits.
-
-## Prerequisites
-To complete this tutorial, we need:
-
-- to compile the latest MXNet version. See the MXNet installation instructions for your operating system in [Setup and Installation]({{'/get_started'|relative_url}}).
-- to compile the Scala API. See Scala API build instructions in [Build](https://github.com/dmlc/mxnet/tree/master/scala-package).
-
-## Define the Network
-
-First, define the neural network's architecture using the Symbol API:
-
-```scala
-import org.apache.mxnet._
-import org.apache.mxnet.optimizer.SGD
-
-// model definition
-val data = Symbol.Variable("data")
-val fc1 = Symbol.api.FullyConnected(Some(data), num_hidden = 128, name = "fc1")
-val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1")
-val fc2 = Symbol.api.FullyConnected(Some(act1), num_hidden = 64, name = "fc2")
-val act2 = Symbol.api.Activation(Some(fc2), "relu", "relu2")
-val fc3 = Symbol.api.FullyConnected(Some(act2), num_hidden = 10, name = "fc3")
-val mlp = Symbol.api.SoftmaxOutput(Some(fc3), name = "sm")
-```
-
-## Load the Data
-
-Then, load the training and validation data using DataIterators.
-
-You can download the MNIST data using the [get_mnist_data script](https://github.com/dmlc/mxnet/blob/master/scala-package/core/scripts/get_mnist_data.sh). We've already written a DataIterator for the MNIST dataset:
-
-```scala
-// load MNIST dataset
-val trainDataIter = IO.MNISTIter(Map(
-  "image" -> "data/train-images-idx3-ubyte",
-  "label" -> "data/train-labels-idx1-ubyte",
-  "data_shape" -> "(1, 28, 28)",
-  "label_name" -> "sm_label",
-  "batch_size" -> "50",
-  "shuffle" -> "1",
-  "flat" -> "0",
-  "silent" -> "0",
-  "seed" -> "10"))
-
-val valDataIter = IO.MNISTIter(Map(
-  "image" -> "data/t10k-images-idx3-ubyte",
-  "label" -> "data/t10k-labels-idx1-ubyte",
-  "data_shape" -> "(1, 28, 28)",
-  "label_name" -> "sm_label",
-  "batch_size" -> "50",
-  "shuffle" -> "1",
-  "flat" -> "0", "silent" -> "0"))
-```
-
-## Train the model
-
-We can use the FeedForward builder to train our network:
-
-```scala
-// setup model and fit the training data
-val model = FeedForward.newBuilder(mlp)
-      .setContext(Context.cpu())
-      .setNumEpoch(10)
-      .setOptimizer(new SGD(learningRate = 0.1f, momentum = 0.9f, wd = 0.0001f))
-      .setTrainData(trainDataIter)
-      .setEvalData(valDataIter)
-      .build()
-```
-
-## Make predictions
-
-Finally, let's make predictions against the validation dataset and compare the predicted labels with the real labels.
-
-```scala
-val probArrays = model.predict(valDataIter)
-// in this case, we do not have multiple outputs
-require(probArrays.length == 1)
-val prob = probArrays(0)
-
-// get real labels
-import scala.collection.mutable.ListBuffer
-valDataIter.reset()
-val labels = ListBuffer.empty[NDArray]
-while (valDataIter.hasNext) {
-  val evalData = valDataIter.next()
-  labels += evalData.label(0).copy()
-}
-val y = NDArray.concatenate(labels)
-
-// get predicted labels
-val predictedY = NDArray.argmax_channel(prob)
-require(y.shape == predictedY.shape)
-
-// calculate accuracy
-var numCorrect = 0
-var numTotal = 0
-for ((labelElem, predElem) <- y.toArray zip predictedY.toArray) {
-  if (labelElem == predElem) {
-    numCorrect += 1
-  }
-  numTotal += 1
-}
-val acc = numCorrect.toFloat / numTotal
-println(s"Final accuracy = $acc")
-```
-
-Check out more MXNet Scala examples below.
-
-## Next Steps
-* [Scala API]({{'/api/scala'|relative_url}})
-* [More Scala Examples](https://github.com/dmlc/mxnet/tree/master/scala-package/examples/)
-* [MXNet tutorials index]({{'/api'|relative_url}})
diff --git a/docs/static_site/src/pages/api/scala/docs/tutorials/model.md b/docs/static_site/src/pages/api/scala/docs/tutorials/model.md
deleted file mode 100644
index 06d2ee0c0eef..000000000000
--- a/docs/static_site/src/pages/api/scala/docs/tutorials/model.md
+++ /dev/null
@@ -1,142 +0,0 @@
----
-layout: page_api
-title: Model API *Deprecated*
-permalink: /api/scala/docs/tutorials/model
-is_tutorial: true
-tag: scala
----
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# MXNet Scala Model API
-
-The model API provides a simplified way to train neural networks using common best practices.
-It's a thin wrapper built on top of the [ndarray](ndarray) and [symbolic](symbol)
-modules that make neural network training easy.
-
-Topics:
-
-* [Train the Model](#train-the-model)
-* [Save the Model](#save-the-model)
-* [Periodic Checkpoint](#periodic-checkpointing)
-* [Multiple Devices](#use-multiple-devices)
-* [Model API Reference]({{'/api/scala/docs/api/#org.apache.mxnet.Model'|relative_url}})
-
-## Train the Model
-
-To train a model, perform two steps: configure the model using the symbol parameter,
-then call ```model.Feedforward.create``` to create the model.
-The following example creates a two-layer neural network.
-
-```scala
-    // configure a two layer neuralnetwork
-    val data = Symbol.Variable("data")
-    val fc1 = Symbol.api.FullyConnected(data, num_hidden = 128, name = "fc1")
-    val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1")
-    val fc2 = Symbol.api.FullyConnected(Some(act1), num_hidden = 64, name = "fc2")
-    val softmax = Symbol.api.SoftmaxOutput(Some(fc2), name = "sm")
-
-    // Construct the FeedForward model and fit on the input training data
-    val model = FeedForward.newBuilder(softmax)
-      .setContext(Context.cpu())
-      .setNumEpoch(num_epoch)
-      .setOptimizer(new SGD(learningRate = 0.01f, momentum = 0.9f, wd = 0.0001f))
-      .setTrainData(trainDataIter)
-      .setEvalData(valDataIter)
-      .build()
-```
-You can also use the `scikit-learn-style` construct and `fit` function to create a model.
-
-```scala
-    // create a model using sklearn-style two-step way
-    val model = new FeedForward(softmax,
-                                numEpoch = numEpochs,
-                                argParams = argParams,
-                                auxParams = auxParams,
-                                beginEpoch = beginEpoch,
-                                epochSize = epochSize)
-
-  model.fit(trainData = train)
-```
-For more information, see [API Reference]({{'/api/scala/docs/api/#package'|relative_url}}).
-
-## Save the Model
-
-After the job is done, save your work.
-We also provide `save` and `load` functions. You can use the `load` function to load a model checkpoint from a file.
-
-```scala
-    // checkpoint the model data into file,
-    // save a model to modelPrefix-symbol.json and modelPrefix-0100.params
-    val modelPrefix: String = "checkpt"
-    val num_epoch = 100
-    Model.saveCheckpoint(modelPrefix, epoch + 1, symbol, argParams, auxStates)
-
-    // load model back
-    val model_loaded = FeedForward.load(modelPrefix, num_epoch)
-```
-The advantage of these two `save` and `load` functions is that they are language agnostic.
-You should be able to save and load directly into cloud storage, such as Amazon S3 and HDFS.
-
-##  Periodic Checkpointing
-
-We recommend checkpointing your model after each iteration.
-To do this, use ```EpochEndCallback``` to add a ```Model.saveCheckpoint(<parameters>)``` checkpoint callback to the function after each iteration.
-
-```scala
-    // modelPrefix-symbol.json will be saved for symbol.
-    // modelPrefix-epoch.params will be saved for parameters.
-    // Checkpoint the model into file. Can specify parameters.
-    // For more information, check API doc.
-    val modelPrefix: String = "checkpt"
-    val checkpoint: EpochEndCallback =
-    if (modelPrefix == null) null
-    else new EpochEndCallback {
-      override def invoke(epoch: Int, symbol: Symbol,
-                         argParams: Map[String, NDArray],
-                         auxStates: Map[String, NDArray]): Unit = {
-       Model.saveCheckpoint(modelPrefix, epoch + 1, symbol, argParams, auxParams)
-            }
-           }
-
-    // Load model checkpoint from file. Returns symbol, argParams, auxParams.
-    val (_, argParams, _) = Model.loadCheckpoint(modelPrefix, num_epoch)
-
-```
-You can load the model checkpoint later using ```Model.loadCheckpoint(modelPrefix, num_epoch)```.
-
-## Use Multiple Devices
-
-Set ```ctx``` to the list of devices that you want to train on. You can create a list of devices in any way you want.
-
-```scala
-    val devices = Array(Context.gpu(0), Context.gpu(1))
-
-    val model = new FeedForward(ctx = devices,
-             symbol = network,
-             numEpoch = numEpochs,
-             optimizer = optimizer,
-             epochSize = epochSize,
-             ...)
-```
-Training occurs in parallel on the GPUs that you specify.
-
-## Next Steps
-* See [Symbolic API](symbol) for operations on NDArrays that assemble neural networks from layers.
-* See [IO Data Loading API](io) for parsing and loading data.
-* See [NDArray API](ndarray) for vector/matrix/tensor operations.
-* See [KVStore API](kvstore) for multi-GPU and multi-host distributed training.
\ No newline at end of file
diff --git a/docs/static_site/src/pages/api/scala/docs/tutorials/module.md b/docs/static_site/src/pages/api/scala/docs/tutorials/module.md
deleted file mode 100644
index 936e066cea43..000000000000
--- a/docs/static_site/src/pages/api/scala/docs/tutorials/module.md
+++ /dev/null
@@ -1,162 +0,0 @@
----
-layout: page_api
-title: Module API
-permalink: /api/scala/docs/tutorials/module
-is_tutorial: true
-tag: scala
----
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# Module API
-The module API provides an intermediate and high-level interface for performing computation with neural networks in MXNet. A *module* is an instance of subclasses of the `BaseModule`. The most widely used module class is called `Module`. Module wraps a `Symbol` and one or more `Executors`. For a full list of functions, see `BaseModule`.
-A subclass of modules might have extra interface functions. This topic provides some examples of common use cases. All of the module APIs are in the `Module` namespace.
-
-## Preparing a Module for Computation
-
-To construct a module, refer to the constructors for the module class. For example, the `Module` class accepts a `Symbol` as input:
-
-```scala
-    import org.apache.mxnet._
-    import org.apache.mxnet.module.{FitParams, Module}
-
-    // construct a simple MLP
-    val data = Symbol.Variable("data")
-    val fc1 = Symbol.api.FullyConnected(Some(data), num_hidden = 128, name = "fc1")
-    val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1")
-    val fc2 = Symbol.api.FullyConnected(Some(act1), num_hidden = 64, name = "fc2")
-    val act2 = Symbol.api.Activation(Some(fc2), "relu", "relu2")
-    val fc3 = Symbol.api.FullyConnected(Some(act2), num_hidden = 10, name = "fc3")
-    val out = Symbol.api.SoftmaxOutput(fc3, name = "softmax")
-
-    // construct the module
-    val mod = new Module(out)
-```
-
-By default, `context` is the CPU. If you need data parallelization, you can specify a GPU context or an array of GPU contexts.
-
-Before you can compute with a module, you need to call `bind()` to allocate the device memory and `initParams()` or `SetParams()` to initialize the parameters.
-If you simply want to fit a module, you don't need to call `bind()` and `initParams()` explicitly, because the fit() function automatically calls them if they are needed.
-
-```scala
-    mod.bind(dataShapes = train_dataiter.provideData, labelShapes = Some(train_dataiter.provideLabel))
-    mod.initParams()
-```
-
-Now you can compute with the module using functions like `forward()`, `backward()`, etc.
-
-## Training, Predicting, and Evaluating
-
-Modules provide high-level APIs for training, predicting, and evaluating. To fit a module, call the `fit()` function with some `DataIter`s:
-
-```scala
-    import org.apache.mxnet.optimizer.SGD
-    val mod = new Module(softmax)
-
-    mod.fit(train_dataiter, evalData = scala.Option(eval_dataiter), \
-    numEpoch = n_epoch, fitParams = new FitParams()\
-    .setOptimizer(new SGD(learningRate = 0.1f, momentum = 0.9f, wd = 0.0001f)))
-```
-
-The interface is very similar to the old `FeedForward` class. You can pass in batch-end callbacks using `setBatchEndCallback` and epoch-end callbacks using `setEpochEndCallback`. You can also set parameters using methods like `setOptimizer` and `setEvalMetric`. To learn more about the `FitParams()`, see the [API page]({{'/api/scala/docs/api/#org.apache.mxnet.module.FitParams'|relative_url}}). To predict with a module, call `predict()` with a `DataIter`:
-
-```scala
-    mod.predict(val_dataiter)
-```
-
-The module collects and returns all of the prediction results. For more details about the format of the return values, see the documentation for the [`predict()` function]({{'/api/scala/docs/api/#org.apache.mxnet.module.BaseModule'|relative_url}}).
-
-When prediction results might be too large to fit in memory, use the `predictEveryBatch` API:
-
-```scala
-    val preds = mod.predictEveryBatch(val_dataiter)
-    val_dataiter.reset()
-    var i = 0
-    while (val_dataiter.hasNext) {
-       val batch = val_dataiter.next()
-       val predLabel: Array[Int] = NDArray.argmax_channel(preds(i)(0)).toArray.map(_.toInt)
-       val label = batch.label(0).toArray.map(_.toInt)
-       //do something...
-       i += 1
-    }
-```
-
-If you need to evaluate on a test set and don't need the prediction output, call the `score()` function with a `DataIter` and an `EvalMetric`:
-
-```scala
-    mod.score(val_dataiter, metric)
-```
-
-This runs predictions on each batch in the provided `DataIter` and computes the evaluation score using the provided `EvalMetric`. The evaluation results are stored in `metric` so that you can query later.
-
-## Saving and Loading Module Parameters
-
-To save the module parameters in each training epoch, use a `checkpoint` callback:
-
-```scala
-    val modelPrefix: String = "mymodel"
-
-    for (epoch <- 0 until 5) {
-      while(train_dataiter.hasNext){
-          // forward backward pass
-         //do something...
-       }
-        val checkpoint = mod.saveCheckpoint(modelPrefix, epoch, saveOptStates = true)
-
-    }
-```
-
-To load the saved module parameters, call the `loadCheckpoint` function:
-
-```scala
-    val mod = Module.loadCheckpoint(modelPrefix, loadModelEpoch, loadOptimizerStates = true)
-```
-
-To initialize parameters, Bind the symbols to construct executors first with `bind` method. Then, initialize the parameters and auxiliary states by calling `initParams()` method.
-
-```scala
-    mod.bind(dataShapes = train_dataiter.provideData, labelShapes = Some(train_dataiter.provideLabel))
-    mod.initParams()
-```
-
-To get current parameters, use `getParams` method.
-
-```scala
-    val (argParams, auxParams) = mod.getParams
-```
-
-To assign parameter and aux state values, use `setParams` method.
-
-```scala
-    mod.setParams(argParams, auxParams)
-```
-
-To resume training from a saved checkpoint, instead of calling `setParams()`, directly call `fit()`, passing the loaded parameters, so that `fit()` knows to start from those parameters instead of initializing randomly:
-
-```scala
-    mod.fit(..., fitParams=new FitParams().setArgParams(argParams).\
-    setAuxParams(auxParams).setBeginEpoch(beginEpoch))
-```
-
-Create an object of the `FitParams()` class, and then use it to call the `setBeginEpoch()` method to pass `beginEpoch` so that `fit()` knows to resume from a saved epoch.
-
-## Next Steps
-* See [Model API](model) for an alternative simple high-level interface for training neural networks.
-* See [Symbolic API](symbol) for operations on NDArrays that assemble neural networks from layers.
-* See [IO Data Loading API](io) for parsing and loading data.
-* See [NDArray API](ndarray) for vector/matrix/tensor operations.
-* See [KVStore API](kvstore) for multi-GPU and multi-host distributed training.
diff --git a/docs/static_site/src/pages/api/scala/docs/tutorials/symbol.md b/docs/static_site/src/pages/api/scala/docs/tutorials/symbol.md
index 6bb8a471cef7..8a8f93f6eecd 100644
--- a/docs/static_site/src/pages/api/scala/docs/tutorials/symbol.md
+++ b/docs/static_site/src/pages/api/scala/docs/tutorials/symbol.md
@@ -41,19 +41,6 @@ We also highly encourage you to read [Symbolic Configuration and Execution in Pi
 The symbolic API provides a way to configure computation graphs.
 You can configure the graphs either at the level of neural network layer operations or as fine-grained operations.
 
-The following example configures a two-layer neural network.
-
-```scala
-    import org.apache.mxnet._
-    val data = Symbol.Variable("data")
-    val fc1 = Symbol.api.FullyConnected(Some(data), num_hidden = 128, name = "fc1")
-    val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1")
-    val fc2 = Symbol.api.FullyConnected(Some(act1), num_hidden = 64, name = "fc2")
-    val net = Symbol.api.SoftmaxOutput(Some(fc2), name = "out")
-    :type net
-    // org.apache.mxnet.Symbol
-```
-
 The basic arithmetic operators (plus, minus, div, multiplication) are overloaded for
 *element-wise operations* of symbols.
 
@@ -130,25 +117,6 @@ input data, and the weights of the neural network that were learned during train
 To manually execute a set of symbols, you need to create an [`Executor`] object,
 which is typically constructed by calling the [`simpleBind(<parameters>)`] method on a symbol.
 
-## Multiple Outputs
-
-To group the symbols together, use the [mxnet.symbol.Group](#mxnet.symbol.Group) function.
-
-```scala
-    import org.apache.mxnet._
-    val data = Symbol.Variable("data")
-    val fc1 = Symbol.api.FullyConnected(Some(data), num_hidden = 128, name = "fc1")
-    val act1 = Symbol.api.Activation(Some(fc1), "relu", "relu1")
-    val fc2 = Symbol.api.FullyConnected(Some(act1), num_hidden = 64, name = "fc2")
-    val net = Symbol.api.SoftmaxOutput(Some(fc2), name = "out")
-    val group = Symbol.Group(fc1, net)
-    group.listOutputs()
-    // IndexedSeq[String] = ArrayBuffer(fc1_output, out_output)
-```
-
-After you get the ```group```, you can bind on ```group``` instead.
-The resulting executor will have two outputs, one for fc1_output and one for softmax_output.
-
 ## Next Steps
 * See [IO Data Loading API](io) for parsing and loading data.
 * See [NDArray API](ndarray) for vector/matrix/tensor operations.
diff --git a/example/automatic-mixed-precision/README.md b/example/automatic-mixed-precision/README.md
index 49147cd87242..334828ab1cce 100644
--- a/example/automatic-mixed-precision/README.md
+++ b/example/automatic-mixed-precision/README.md
@@ -22,14 +22,8 @@ This folder contains examples for converting FP32 models to mixed precision mode
 
 ## Basic Usages
 
-1. AMP Model Conversion for a gluon model, casting the params wherever possible to FP16. The below script will convert the `resnet101_v1` model to Mixed Precision Model and cast params to FP16 wherever possible, load this converted model and run inference on it.
+AMP model conversion for a Gluon model, casting the params to FP16 wherever possible. The script below converts the `resnet101_v1` model to a mixed precision model, casts its params to FP16 wherever possible, loads the converted model, and runs inference on it.
 
 ```bash
-python amp_model_conversion.py --model resnet101_v1 --use-gluon-model  --run-dummy-inference --cast-optional-params
-```
-
-2. AMP Model Conversion for a symbolic model, keeping the params in FP32 wherever possible (--cast-optional-params not used).
-
-```bash
-python amp_model_conversion.py --model imagenet1k-resnet-152  --run-dummy-inference
+python amp_model_conversion.py --model resnet101_v1  --run-dummy-inference --cast-optional-params
 ```
diff --git a/example/automatic-mixed-precision/amp_model_conversion.py b/example/automatic-mixed-precision/amp_model_conversion.py
index b363e0244a10..d0e625b0adf8 100644
--- a/example/automatic-mixed-precision/amp_model_conversion.py
+++ b/example/automatic-mixed-precision/amp_model_conversion.py
@@ -25,13 +25,6 @@
 from mxnet.contrib.amp import amp
 import numpy as np
 
-def download_model(model_name, logger=None):
-    dir_path = os.path.dirname(os.path.realpath(__file__))
-    model_path = os.path.join(dir_path, 'model')
-    if logger is not None:
-        logger.info('Downloading model {}... into path {}'.format(model_name, model_path))
-    return modelzoo.download_model(args.model, os.path.join(dir_path, 'model'))
-
 
 def save_symbol(fname, sym, logger=None):
     if logger is not None:
@@ -48,16 +41,6 @@ def save_params(fname, arg_params, aux_params, logger=None):
 
 
 if __name__ == '__main__':
-    symbolic_models = ['imagenet1k-resnet-152',
-                       'imagenet1k-resnet-18',
-                       'imagenet1k-resnet-34',
-                       'imagenet1k-resnet-50',
-                       'imagenet1k-resnet-101',
-                       'imagenet1k-resnext-50',
-                       'imagenet1k-resnext-101',
-                       'imagenet1k-resnext-101-64x4d',
-                       'imagenet11k-place365ch-resnet-152',
-                       'imagenet11k-place365ch-resnet-50']
     # Faster RCNN and Mask RCNN commented because of model loading issues
     # https://github.com/dmlc/gluon-cv/issues/1034
     gluon_models = [#'faster_rcnn_fpn_resnet50_v1b_coco',
@@ -181,16 +164,13 @@ def save_params(fname, arg_params, aux_params, logger=None):
     calib_inception_models = ["inceptionv3"]
     gluon_models = gluon_models + segmentation_models + \
                    calib_ssd_models + calib_inception_models
-    models = symbolic_models + gluon_models
+    models = gluon_models
 
     parser = argparse.ArgumentParser(description='Convert a provided FP32 model to a mixed precision model')
     parser.add_argument('--model', type=str, choices=models)
     parser.add_argument('--run-dummy-inference', action='store_true', default=False,
                         help='Will generate random input of shape (1, 3, 224, 224) '
                              'and run a dummy inference forward pass')
-    parser.add_argument('--use-gluon-model', action='store_true', default=False,
-                        help='If enabled, will download pretrained model from Gluon-CV '
-                             'and convert to mixed precision model ')
     parser.add_argument('--cast-optional-params', action='store_true', default=False,
                         help='If enabled, will try to cast params to target dtype wherever possible')
     args = parser.parse_args()
@@ -198,47 +178,24 @@ def save_params(fname, arg_params, aux_params, logger=None):
     logger = logging.getLogger('logger')
     logger.setLevel(logging.INFO)
 
-    if not args.use_gluon_model:
-        assert args.model in symbolic_models, "Please choose one of the available symbolic models: {} \
-                                               If you want to use gluon use the script with --use-gluon-model".format(symbolic_models)
-
-        prefix, epoch = download_model(model_name=args.model, logger=logger)
-        sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
-        result_sym, result_arg_params, result_aux_params = amp.convert_model(sym, arg_params, aux_params,
-                                                                             cast_optional_params=args.cast_optional_params)
-        sym_name = "%s-amp-symbol.json" % (prefix)
-        save_symbol(sym_name, result_sym, logger)
-        param_name = '%s-%04d.params' % (prefix + '-amp', epoch)
-        save_params(param_name, result_arg_params, result_aux_params, logger)
-        if args.run_dummy_inference:
-            logger.info("Running inference on the mixed precision model with dummy input, batch size: 1")
-            mod = mx.mod.Module(result_sym, data_names=['data'], label_names=['softmax_label'], context=mx.gpu(0))
-            mod.bind(data_shapes=[['data', (1, 3, 224, 224)]], label_shapes=[['softmax_label', (1,)]])
-            mod.set_params(arg_params, aux_params)
-            mod.forward(mx.io.DataBatch(data=[mx.nd.ones((1, 3, 224, 224))],
-                                        label=[mx.nd.ones((1,))]))
-            result = mod.get_outputs()[0].asnumpy()
-            logger.info("Inference run successfully")
+    assert args.model in gluon_models, "Please choose one of the available gluon models: {}".format(gluon_models)
+    shape = None
+    if args.model in segmentation_models:
+        shape = (1, 3, 480, 480)
+    elif args.model in calib_ssd_models:
+        shape = (1, 3, 512, 544)
+    elif args.model in calib_inception_models:
+        shape = (1, 3, 299, 299)
     else:
-        assert args.model in gluon_models, "Please choose one of the available gluon models: {} \
-                                            If you want to use symbolic model instead, remove --use-gluon-model when running the script".format(gluon_models)
-        shape = None
-        if args.model in segmentation_models:
-            shape = (1, 3, 480, 480)
-        elif args.model in calib_ssd_models:
-            shape = (1, 3, 512, 544)
-        elif args.model in calib_inception_models:
-            shape = (1, 3, 299, 299)
-        else:
-            shape = (1, 3, 224, 224)
-        net = gluoncv.model_zoo.get_model(args.model, pretrained=True)
-        net.hybridize()
-        result_before1 = net.forward(mx.nd.random.uniform(shape=shape))
-        net.export("{}".format(args.model))
-        net = amp.convert_hybrid_block(net, cast_optional_params=args.cast_optional_params)
-        net.export("{}-amp".format(args.model), remove_amp_cast=False)
-        if args.run_dummy_inference:
-            logger.info("Running inference on the mixed precision model with dummy inputs, batch size: 1")
-            result_after = net.forward(mx.nd.random.uniform(shape=shape, dtype=np.float32, ctx=mx.gpu(0)))
-            result_after = net.forward(mx.nd.random.uniform(shape=shape, dtype=np.float32, ctx=mx.gpu(0)))
-            logger.info("Inference run successfully")
+        shape = (1, 3, 224, 224)
+    net = gluoncv.model_zoo.get_model(args.model, pretrained=True)
+    net.hybridize()
+    result_before1 = net.forward(mx.nd.random.uniform(shape=shape))
+    net.export("{}".format(args.model))
+    net = amp.convert_hybrid_block(net, cast_optional_params=args.cast_optional_params)
+    net.export("{}-amp".format(args.model), remove_amp_cast=False)
+    if args.run_dummy_inference:
+        logger.info("Running inference on the mixed precision model with dummy inputs, batch size: 1")
+        result_after = net.forward(mx.nd.random.uniform(shape=shape, dtype=np.float32, ctx=mx.gpu(0)))
+        result_after = net.forward(mx.nd.random.uniform(shape=shape, dtype=np.float32, ctx=mx.gpu(0)))
+        logger.info("Inference run successfully")
diff --git a/example/captcha/README.md b/example/captcha/README.md
deleted file mode 100644
index 7743997e90bd..000000000000
--- a/example/captcha/README.md
+++ /dev/null
@@ -1,22 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-This is the R version of [captcha recognition](http://blog.xlvector.net/2016-05/mxnet-ocr-cnn/) example by xlvector and it can be used as an example of multi-label training. For a captcha below, we consider it as an image with 4 labels and train a CNN over the data set.
-
-![](captcha_example.png)
-
-You can download the images and `.rec` files from [here](https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/data/captcha_example.zip). Since each image has 4 labels, please remember to use `label_width=4` when generating the `.rec` files.
diff --git a/example/captcha/captcha_example.png b/example/captcha/captcha_example.png
deleted file mode 100644
index 09b84f7190fa..000000000000
Binary files a/example/captcha/captcha_example.png and /dev/null differ
diff --git a/example/captcha/mxnet_captcha.R b/example/captcha/mxnet_captcha.R
deleted file mode 100644
index 43e819f8c264..000000000000
--- a/example/captcha/mxnet_captcha.R
+++ /dev/null
@@ -1,85 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-library(mxnet)
-
-data <- mx.symbol.Variable('data')
-label <- mx.symbol.Variable('label')
-conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 32)
-pool1 <- mx.symbol.Pooling(data = conv1, pool_type = "max", kernel = c(2, 2), stride = c(1, 1))
-relu1 <- mx.symbol.Activation(data = pool1, act_type = "relu")
-
-conv2 <- mx.symbol.Convolution(data = relu1, kernel = c(5, 5), num_filter = 32)
-pool2 <- mx.symbol.Pooling(data = conv2, pool_type = "avg", kernel = c(2, 2), stride = c(1, 1))
-relu2 <- mx.symbol.Activation(data = pool2, act_type = "relu")
-
-flatten <- mx.symbol.Flatten(data = relu2)
-fc1 <- mx.symbol.FullyConnected(data = flatten, num_hidden = 120)
-fc21 <- mx.symbol.FullyConnected(data = fc1, num_hidden = 10)
-fc22 <- mx.symbol.FullyConnected(data = fc1, num_hidden = 10)
-fc23 <- mx.symbol.FullyConnected(data = fc1, num_hidden = 10)
-fc24 <- mx.symbol.FullyConnected(data = fc1, num_hidden = 10)
-fc2 <- mx.symbol.Concat(c(fc21, fc22, fc23, fc24), dim = 0, num.args = 4)
-label <- mx.symbol.transpose(data = label)
-label <- mx.symbol.Reshape(data = label, target_shape = c(0))
-captcha_net <- mx.symbol.SoftmaxOutput(data = fc2, label = label, name = "softmax")
-
-mx.metric.acc2 <- mx.metric.custom("accuracy", function(label, pred) {
-    ypred <- max.col(t(data.matrix(pred))) - 1
-    ypred <- matrix(ypred, nrow = nrow(label), ncol = ncol(label), byrow = TRUE)
-    return(sum(colSums(data.matrix(label) == ypred) == 4) / ncol(label))
-  })
-
-data.shape <- c(80, 30, 3)
-
-batch_size <- 40
-
-train <- mx.io.ImageRecordIter(
-  path.imgrec     = "captcha_train.rec",
-  path.imglist    = "captcha_train.lst",
-  batch.size      = batch_size,
-  label.width     = 4,
-  data.shape      = data.shape,
-  mean.img        = "mean.bin"
-)
-
-val <- mx.io.ImageRecordIter(
-  path.imgrec     = "captcha_test.rec",
-  path.imglist    = "captcha_test.lst",
-  batch.size      = batch_size,
-  label.width     = 4,
-  data.shape      = data.shape,
-  mean.img        = "mean.bin"
-)
-
-mx.set.seed(42)
-
-model <- mx.model.FeedForward.create(
-  X                  = train,
-  eval.data          = val,
-  ctx                = mx.gpu(),
-  symbol             = captcha_net,
-  eval.metric        = mx.metric.acc2,
-  num.round          = 10,
-  learning.rate      = 0.0001,
-  momentum           = 0.9,
-  wd                 = 0.00001,
-  batch.end.callback = mx.callback.log.train.metric(50),
-  initializer        = mx.init.Xavier(factor_type = "in", magnitude = 2.34),
-  optimizer          = "sgd",
-  clip_gradient = 10
-)
diff --git a/example/distributed_training-horovod/README.md b/example/distributed_training-horovod/README.md
index b2b4ad754dd2..1432cc4af500 100644
--- a/example/distributed_training-horovod/README.md
+++ b/example/distributed_training-horovod/README.md
@@ -71,7 +71,7 @@ To run MXNet with Horovod, make the following additions to your training script:
 3. Scale the learning rate by number of workers. Effective batch size in synchronous distributed training is scaled by
     the number of workers. An increase in learning rate compensates for the increased batch size.
 
-4. Create `hvd.DistributedTrainer` with optimizer when using Gluon API or wrap optimizer in `hvd.DistributedOptimizer` when using Module API.  The distributed trainer or optimizer delegates gradient computation
+4. Create `hvd.DistributedTrainer` with the optimizer when using the Gluon API. The distributed trainer delegates gradient computation
     to the original optimizer, averages gradients using *allreduce*, and then applies those averaged
     gradients.
 
@@ -134,50 +134,6 @@ for epoch in range(num_epoch):
         trainer.step(batch_size)
 ```
 
-## Module API
-```python
-import mxnet as mx
-import horovod.mxnet as hvd
-
-# Initialize Horovod
-hvd.init()
-
-# Set context to current process
-context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(hvd.local_rank())
-num_workers = hvd.size()
-
-# Build model
-model = ...
-
-# Define hyper parameters
-optimizer_params = ...
-
-# Add Horovod Distributed Optimizer
-opt = mx.optimizer.create('sgd', **optimizer_params)
-opt = hvd.DistributedOptimizer(opt)
-
-# Initialize parameters
-initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in",
-                             magnitude=2)
-model.bind(data_shapes=train_data.provide_data,
-           label_shapes=train_data.provide_label)
-model.init_params(initializer)
-
-# Fetch and broadcast parameters
-(arg_params, aux_params) = model.get_params()
-if arg_params:
-    hvd.broadcast_parameters(arg_params, root_rank=0)
-if aux_params:
-    hvd.broadcast_parameters(aux_params, root_rank=0)
-model.set_params(arg_params=arg_params, aux_params=aux_params)
-
-# Train model
-model.fit(train_data,
-          kvstore=None,
-          optimizer=opt,
-          num_epoch=num_epoch)
-```
-
 
 # Running Horovod
 
diff --git a/example/distributed_training-horovod/module_mnist.py b/example/distributed_training-horovod/module_mnist.py
deleted file mode 100644
index 74f6bc9daf21..000000000000
--- a/example/distributed_training-horovod/module_mnist.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import logging
-import os
-import zipfile
-
-import horovod.mxnet as hvd
-import mxnet as mx
-from mxnet.test_utils import download
-
-# Training settings
-parser = argparse.ArgumentParser(description='MXNet MNIST Example')
-parser.add_argument('--batch-size', type=int, default=64,
-                    help='training batch size (default: 64)')
-parser.add_argument('--dtype', type=str, default='float32',
-                    help='training data type (default: float32)')
-parser.add_argument('--epochs', type=int, default=5,
-                    help='number of training epochs (default: 5)')
-parser.add_argument('--lr', type=float, default=0.05,
-                    help='learning rate (default: 0.05)')
-parser.add_argument('--momentum', type=float, default=0.5,
-                    help='SGD momentum (default: 0.5)')
-parser.add_argument('--no-cuda', action='store_true', default=False,
-                    help='disables CUDA training (default: False)')
-args = parser.parse_args()
-
-if not args.no_cuda:
-    # Disable CUDA if there are no GPUs.
-    if mx.context.num_gpus() == 0:
-        args.no_cuda = True
-
-logging.basicConfig(level=logging.INFO)
-logging.info(args)
-
-
-# Function to get mnist iterator given a rank
-def get_mnist_iterator(rank):
-    data_dir = "data-%d" % rank
-    if not os.path.isdir(data_dir):
-        os.makedirs(data_dir)
-    zip_file_path = download('http://data.mxnet.io/mxnet/data/mnist.zip',
-                             dirname=data_dir)
-    with zipfile.ZipFile(zip_file_path) as zf:
-        zf.extractall(data_dir)
-
-    input_shape = (1, 28, 28)
-    batch_size = args.batch_size
-
-    train_iter = mx.io.MNISTIter(
-        image="%s/train-images-idx3-ubyte" % data_dir,
-        label="%s/train-labels-idx1-ubyte" % data_dir,
-        input_shape=input_shape,
-        batch_size=batch_size,
-        shuffle=True,
-        flat=False,
-        num_parts=hvd.size(),
-        part_index=hvd.rank()
-    )
-
-    val_iter = mx.io.MNISTIter(
-        image="%s/t10k-images-idx3-ubyte" % data_dir,
-        label="%s/t10k-labels-idx1-ubyte" % data_dir,
-        input_shape=input_shape,
-        batch_size=batch_size,
-        flat=False,
-        num_parts=hvd.size(),
-        part_index=hvd.rank()
-    )
-
-    return train_iter, val_iter
-
-# Step 1: initialize Horovod
-hvd.init()
-
-# Horovod: pin context to process
-context = mx.cpu(hvd.local_rank()) if args.no_cuda else mx.gpu(hvd.local_rank())
-
-# Step 2: load data
-train_iter, val_iter = get_mnist_iterator(hvd.rank())
-
-# Step 3: define network
-def conv_net():
-    # placeholder for data
-    data = mx.sym.var('data')
-    # first conv layer
-    conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=10)
-    relu1 = mx.sym.Activation(data=conv1, act_type='relu')
-    pool1 = mx.sym.Pooling(data=relu1, pool_type='max', kernel=(2, 2),
-                           stride=(2, 2))
-    # second conv layer
-    conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=20)
-    relu2 = mx.sym.Activation(data=conv2, act_type='relu')
-    pool2 = mx.sym.Pooling(data=relu2, pool_type='max', kernel=(2, 2),
-                           stride=(2, 2))
-    # first fully connected layer
-    flatten = mx.sym.flatten(data=pool2)
-    fc1 = mx.symbol.FullyConnected(data=flatten, num_hidden=50)
-    relu3 = mx.sym.Activation(data=fc1, act_type='relu')
-    # second fully connected layer
-    fc2 = mx.sym.FullyConnected(data=relu3, num_hidden=10)
-    # softmax loss
-    loss = mx.sym.SoftmaxOutput(data=fc2, name='softmax')
-    return loss
-
-net = conv_net()
-model = mx.mod.Module(symbol=net, context=context)
-
-# Step 4: initialize parameters
-initializer = mx.init.Xavier(rnd_type='gaussian', factor_type="in",
-                             magnitude=2)
-model.bind(data_shapes=train_iter.provide_data,
-           label_shapes=train_iter.provide_label)
-model.init_params(initializer)
-
-# Horovod: fetch and broadcast parameters
-(arg_params, aux_params) = model.get_params()
-if arg_params is not None:
-    hvd.broadcast_parameters(arg_params, root_rank=0)
-if aux_params is not None:
-    hvd.broadcast_parameters(aux_params, root_rank=0)
-model.set_params(arg_params=arg_params, aux_params=aux_params)
-
-# Step 5: create optimizer
-optimizer_params = {'learning_rate': args.lr * hvd.size(),
-                    'rescale_grad': 1.0 / args.batch_size}
-opt = mx.optimizer.create('sgd', **optimizer_params)
-
-# Horovod: wrap optimizer with DistributedOptimizer
-opt = hvd.DistributedOptimizer(opt)
-
-# Step 6: fit and train model
-batch_cb = None
-if hvd.rank() == 0:
-    batch_cb = mx.callback.Speedometer(args.batch_size * hvd.size())
-model.fit(train_iter,  # train data
-          kvstore=None,  # no kvstore
-          eval_data=val_iter,  # validation data
-          optimizer=opt,  # use SGD to train
-          eval_metric='acc',  # report accuracy during training
-          batch_end_callback=batch_cb,  # report training speed
-          num_epoch=args.epochs)  # train for at most 10 dataset passes
-
-# Step 7: evaluate model accuracy
-acc = mx.gluon.metric.Accuracy()
-model.score(val_iter, acc)
-
-if hvd.rank() == 0:
-    print(acc)
-    assert acc.get()[1] > 0.96, "Achieved accuracy (%f) is lower than \
-                                expected (0.96)" % acc.get()[1]
diff --git a/example/distributed_training-horovod/resnet50_imagenet.py b/example/distributed_training-horovod/resnet50_imagenet.py
index ae8a56100929..cdf17a8912e0 100644
--- a/example/distributed_training-horovod/resnet50_imagenet.py
+++ b/example/distributed_training-horovod/resnet50_imagenet.py
@@ -72,9 +72,6 @@
                     each bottleneck to 0 (default: False)')
 parser.add_argument('--model', type=str, default='resnet50_v1',
                     help='type of model to use. see vision_model for options.')
-parser.add_argument('--mode', type=str, default='module',
-                    help='mode in which to train the model. options are \
-                    module, gluon (default: module)')
 parser.add_argument('--use-pretrained', action='store_true', default=False,
                     help='load pretrained model weights (default: False)')
 parser.add_argument('--no-cuda', action='store_true', default=False,
@@ -371,97 +368,6 @@ def evaluate(epoch):
     evaluate(epoch)
 
 
-def train_module():
-    # Create input symbol
-    data = mx.sym.var('data')
-    if args.dtype == 'float16':
-        data = mx.sym.Cast(data=data, dtype=np.float16)
-        net.cast(np.float16)
-
-    # Create output symbol
-    out = net(data)
-    if args.dtype == 'float16':
-        out = mx.sym.Cast(data=out, dtype=np.float32)
-    softmax = mx.sym.SoftmaxOutput(out, name='softmax')
-
-    # Create model
-    mod = mx.mod.Module(softmax, context=context)
-
-    # Initialize parameters
-    if args.use_pretrained:
-        arg_params = {}
-        for x in net.collect_params().values():
-            x.reset_ctx(mx.cpu())
-            arg_params[x.name] = x.data()
-    else:
-        arg_params = None
-    aux_params = None
-    mod.bind(data_shapes=train_data.provide_data,
-             label_shapes=train_data.provide_label)
-    mod.init_params(initializer, arg_params=arg_params, aux_params=aux_params)
-
-    # Horovod: fetch and broadcast parameters
-    (arg_params, aux_params) = mod.get_params()
-    if arg_params is not None:
-        hvd.broadcast_parameters(arg_params, root_rank=0)
-    if aux_params is not None:
-        hvd.broadcast_parameters(aux_params, root_rank=0)
-    mod.set_params(arg_params=arg_params, aux_params=aux_params)
-
-    # Create optimizer
-    # Note that when using Module API, we need to specify rescale_grad since
-    # we create optimizer first and wrap it with DistributedOptimizer. For
-    # Gluon API, it is handled in Trainer.step() function so there is no need
-    # to specify rescale_grad (see above train_gluon() function). 
-    optimizer_params = {'wd': args.wd,
-                        'momentum': args.momentum,
-                        'rescale_grad': 1.0 / batch_size,
-                        'lr_scheduler': lr_sched}
-    if args.dtype == 'float16':
-        optimizer_params['multi_precision'] = True
-    opt = mx.optimizer.create('sgd', **optimizer_params)
-
-    # Horovod: wrap optimizer with DistributedOptimizer
-    opt = hvd.DistributedOptimizer(opt)
-
-    # Setup validation data and callback during training
-    eval_data = None
-    if args.eval_epoch:
-        eval_data = val_data
-    batch_callback = None
-    if args.log_interval > 0 and rank == 0:
-        batch_callback = mx.callback.Speedometer(batch_size * num_workers,
-                                                 args.log_interval)
-
-    epoch_callback = None
-    if args.save_frequency > 0:
-        epoch_callback = mx.callback.do_checkpoint(
-            '%s-%d' % (args.model, rank),
-            period=args.save_frequency)
-
-    # Train model
-    mod.fit(train_data,
-            eval_data=eval_data,
-            num_epoch=args.num_epochs,
-            kvstore=None,
-            batch_end_callback=batch_callback,
-            epoch_end_callback=epoch_callback,
-            optimizer=opt)
-
-    # Evaluate performance if not using synthetic data
-    if args.use_rec:
-        acc_top1 = mx.gluon.metric.Accuracy()
-        acc_top5 = mx.gluon.metric.TopKAccuracy(5)
-        res = mod.score(val_data, [acc_top1, acc_top5])
-        for name, val in res:
-            logging.info('Epoch[%d] Rank[%d] Validation-%s=%f',
-                         args.num_epochs - 1, rank, name, val)
-
 
 if __name__ == '__main__':
-    if args.mode == 'module':
-        train_module()
-    elif args.mode == 'gluon':
-        train_gluon()
-    else:
-        raise ValueError('Invalid training mode.')
+    train_gluon()
diff --git a/example/gluon/image_classification.py b/example/gluon/image_classification.py
index de31b06655eb..33583ff20175 100644
--- a/example/gluon/image_classification.py
+++ b/example/gluon/image_classification.py
@@ -68,7 +68,7 @@
 parser.add_argument('--seed', type=int, default=123,
                     help='random seed to use. Default=123.')
 parser.add_argument('--mode', type=str,
-                    help='mode in which to train the model. options are symbolic, imperative, hybrid')
+                    help='mode in which to train the model. options are imperative, hybrid')
 parser.add_argument('--model', type=str, required=True,
                     help='type of model to use. see vision_model for options.')
 parser.add_argument('--use_thumbnail', action='store_true',
@@ -256,30 +256,9 @@ def main():
     if opt.builtin_profiler > 0:
         profiler.set_config(profile_all=True, aggregate_stats=True)
         profiler.set_state('run')
-    if opt.mode == 'symbolic':
-        data = mx.sym.var('data')
-        if opt.dtype == 'float16':
-            data = mx.sym.Cast(data=data, dtype=np.float16)
-        out = net(data)
-        if opt.dtype == 'float16':
-            out = mx.sym.Cast(data=out, dtype=np.float32)
-        softmax = mx.sym.SoftmaxOutput(out, name='softmax')
-        mod = mx.mod.Module(softmax, context=context)
-        train_data, val_data = get_data_iters(dataset, batch_size, opt)
-        mod.fit(train_data,
-                eval_data=val_data,
-                num_epoch=opt.epochs,
-                kvstore=kv,
-                batch_end_callback = mx.callback.Speedometer(batch_size, max(1, opt.log_interval)),
-                epoch_end_callback = mx.callback.do_checkpoint('image-classifier-%s'% opt.model),
-                optimizer = 'sgd',
-                optimizer_params = {'learning_rate': opt.lr, 'wd': opt.wd, 'momentum': opt.momentum, 'multi_precision': True},
-                initializer = mx.init.Xavier(magnitude=2))
-        mod.save_parameters('image-classifier-%s-%d-final.params'%(opt.model, opt.epochs))
-    else:
-        if opt.mode == 'hybrid':
-            net.hybridize()
-        train(opt, context)
+    if opt.mode == 'hybrid':
+        net.hybridize()
+    train(opt, context)
     if opt.builtin_profiler > 0:
         profiler.set_state('stop')
         print(profiler.dumps())
diff --git a/example/kaggle-ndsb2/Preprocessing.py b/example/kaggle-ndsb2/Preprocessing.py
deleted file mode 100644
index 29b4ba009a9a..000000000000
--- a/example/kaggle-ndsb2/Preprocessing.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-"""Preprocessing script.
-
-This script walks over the directories and dump the frames into a csv file
-"""
-import os
-import csv
-import sys
-import random
-import scipy
-import numpy as np
-import dicom
-from skimage import io, transform
-from joblib import Parallel, delayed
-import dill
-
-def mkdir(fname):
-   try:
-       os.mkdir(fname)
-   except:
-       pass
-
-def get_frames(root_path):
-   """Get path to all the frame in view SAX and contain complete frames"""
-   ret = []
-   for root, _, files in os.walk(root_path):
-       root=root.replace('\\','/')
-       files=[s for s in files if ".dcm" in s]
-       if len(files) == 0 or not files[0].endswith(".dcm") or root.find("sax") == -1:
-           continue
-       prefix = files[0].rsplit('-', 1)[0]
-       fileset = set(files)
-       expected = ["%s-%04d.dcm" % (prefix, i + 1) for i in range(30)]
-       if all(x in fileset for x in expected):
-           ret.append([root + "/" + x for x in expected])
-   # sort for reproduciblity
-   return sorted(ret, key = lambda x: x[0])
-
-
-def get_label_map(fname):
-   labelmap = {}
-   fi = open(fname)
-   fi.readline()
-   for line in fi:
-       arr = line.split(',')
-       labelmap[int(arr[0])] = line
-   return labelmap
-
-
-def write_label_csv(fname, frames, label_map):
-   fo = open(fname, "w")
-   for lst in frames:
-       index = int(lst[0].split("/")[3])
-       if label_map is not None:
-           fo.write(label_map[index])
-       else:
-           fo.write("%d,0,0\n" % index)
-   fo.close()
-
-
-def get_data(lst,preproc):
-   data = []
-   result = []
-   for path in lst:
-       f = dicom.read_file(path)
-       img = preproc(f.pixel_array.astype(float) / np.max(f.pixel_array))
-       dst_path = path.rsplit(".", 1)[0] + ".64x64.jpg"
-       scipy.misc.imsave(dst_path, img)
-       result.append(dst_path)
-       data.append(img)
-   data = np.array(data, dtype=np.uint8)
-   data = data.reshape(data.size)
-   data = np.array(data, dtype=np.str_)
-   data = data.reshape(data.size)
-   return [data,result]
-
-
-def write_data_csv(fname, frames, preproc):
-   """Write data to csv file"""
-   fdata = open(fname, "w")
-   dr = Parallel()(delayed(get_data)(lst,preproc) for lst in frames)
-   data,result = zip(*dr)
-   for entry in data:
-      fdata.write(','.join(entry)+'\r\n')
-   print("All finished, %d slices in total" % len(data))
-   fdata.close()
-   result = np.ravel(result)
-   return result
-
-
-def crop_resize(img, size):
-   """crop center and resize"""
-   if img.shape[0] < img.shape[1]:
-       img = img.T
-   # we crop image from center
-   short_egde = min(img.shape[:2])
-   yy = int((img.shape[0] - short_egde) / 2)
-   xx = int((img.shape[1] - short_egde) / 2)
-   crop_img = img[yy : yy + short_egde, xx : xx + short_egde]
-   # resize to 64, 64
-   resized_img = transform.resize(crop_img, (size, size))
-   resized_img *= 255
-   return resized_img.astype("uint8")
-
-
-def local_split(train_index):
-   random.seed(0)
-   train_index = set(train_index)
-   all_index = sorted(train_index)
-   num_test = int(len(all_index) / 3)
-   random.shuffle(all_index)
-   train_set = set(all_index[num_test:])
-   test_set = set(all_index[:num_test])
-   return train_set, test_set
-
-
-def split_csv(src_csv, split_to_train, train_csv, test_csv):
-   ftrain = open(train_csv, "w")
-   ftest = open(test_csv, "w")
-   cnt = 0
-   for l in open(src_csv):
-       if split_to_train[cnt]:
-           ftrain.write(l)
-       else:
-           ftest.write(l)
-       cnt = cnt + 1
-   ftrain.close()
-   ftest.close()
-
-# Load the list of all the training frames, and shuffle them
-# Shuffle the training frames
-random.seed(10)
-train_frames = get_frames("./data/train")
-random.shuffle(train_frames)
-validate_frames = get_frames("./data/validate")
-
-# Write the corresponding label information of each frame into file.
-write_label_csv("./train-label.csv", train_frames, get_label_map("./data/train.csv"))
-write_label_csv("./validate-label.csv", validate_frames, None)
-
-# Dump the data of each frame into a CSV file, apply crop to 64 preprocessor
-train_lst = write_data_csv("./train-64x64-data.csv", train_frames, lambda x: crop_resize(x, 64))
-valid_lst = write_data_csv("./validate-64x64-data.csv", validate_frames, lambda x: crop_resize(x, 64))
-
-# Generate local train/test split, which you could use to tune your model locally.
-train_index = np.loadtxt("./train-label.csv", delimiter=",")[:,0].astype("int")
-train_set, test_set = local_split(train_index)
-split_to_train = [x in train_set for x in train_index]
-split_csv("./train-label.csv", split_to_train, "./local_train-label.csv", "./local_test-label.csv")
-split_csv("./train-64x64-data.csv", split_to_train, "./local_train-64x64-data.csv", "./local_test-64x64-data.csv")
diff --git a/example/kaggle-ndsb2/README.md b/example/kaggle-ndsb2/README.md
deleted file mode 100644
index ffe1ca203d4a..000000000000
--- a/example/kaggle-ndsb2/README.md
+++ /dev/null
@@ -1,81 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# End-to-End Deep Learning Tutorial for Kaggle NDSB-II
-
-In this example, we will demo how to use MXNet to build an end-to-end deep learning system to help Diagnose Heart Disease.  The demo network is able to achieve 0.039222 CRPS on validation set, which is good enough to get Top-10 (on Dec 22nd, 2015).
-
-Notice this is a very simple model with no attempt to optimize the structure or hyper parameters, you can build fantastic network based on it. While this tutorial is written in python, mxnet comes with support for other popular languages such as R and Julia which can also be used. You are more than welcomed to try and contribute back to this example.
-
-This example requires GPU to train. If you are working with AWS,
-A simple guide to build MXNet on AWS and existing AMI can be found at [This document](https://mxnet.readthedocs.org/en/latest/aws.html).
-you can also choose to put your data on S3, and having all the machine directly load data from S3, without having to copy data over when you are starting new instances.
-
-
-## General Overview of model
-### Input Data
-We notice for in most of data, there are 30 frames for a sequence. A simple idea is pack this sequence into a multi-channel input, then let neural network learn from it. This tutorial is based on this idea: We first find accumulate all suitable data with 30 frames, then feed to the neural network to learn the target directly.
-
-Another idea is use difference to measure change in time-series. By using MXNet symbolic interface, we can build a dynamic difference channels to transform input inside of the network. It helps a little in the final result.
-
-### Network Objective
-For the network, we use a 20 years old LeNet style convolution network with batch normalization and dropout. We did not finetune the configuration and hyper parameters as this is mainly for demonstration purposes. We are sure better solutions can be found.
-
-One important idea of the model is to predict what the problem is asking for. In this problem, we are asked to predict a CDF value on 600 data-point. So we formulate the problem as a regression problem. We ask the neural-net to output 600 values, which corresponds to the CDF value to be predicted. The label is transformed into the 0-1 function as used in the evaluation target.
-
-
-## Preprocessing
-We first run a preprocessing step, to pack the data into a csv file. Each line of the csv file corresponds to a 30 x 64 x 64 tensor, which gives 30 frames of resized images. We can also use other inputs besides csv. We choose the csv because this format is quite common for all language and it is easy to manipulate.
-The input dataset is quite big. While they can fit into memory of a big machine, we want to be safe for all desktop settings, so we will use a CSVIter from mxnet to load data from disk on the fly during training, without loading all the data into memory. You are also more than welcomed to try the in-memory setting.
-
-
-
-## Step by step
-
-Prepare raw data in ```data``` folder. The tree of ```data``` folder is like
-
-```
--data
- |
- ---- sample_submission_validate.csv
- |
- ---- train.csv
- |
- ---- train
- |    |
- |    ---- 0
- |    |
- |    ---- …
- |
- ---- validate
-      |
-      ---- 501
-      |
-      ---- …
-```
-
-2. Run ```python3 Preprocessing.py``` to do preprocessing of data.
-3. We provide the R code with the network structure and parameters in ```Train.R```. Right now it used the pre-processed csv files by ```Preprocessing.py```.
-
-Note:
-- To modify network, change ```get_lenet``` function in ```Train.py``` or ```get.lenet``` function in ```Train.R```.
-- We also provide ```local_train```, ```local_test``` file for local parameter tuning.
-- To run on multiple GPU with huge network, or questions about saving network parameters etc, please refer [MXNet docs](https://mxnet.io/)
-
-
-## About MXNet
-MXNet is a deep learning framework designed for both efficiency and flexibility by DMLC group. Like all other packages in DMLC, it will fully utilize all the resources to solve the problem under limited resource constraint, with a flexible programming interface. You can use it for all purposes of data science and deep learning tasks with R, Julia, python and more.
diff --git a/example/kaggle-ndsb2/Train.R b/example/kaggle-ndsb2/Train.R
deleted file mode 100644
index 812acf1baa34..000000000000
--- a/example/kaggle-ndsb2/Train.R
+++ /dev/null
@@ -1,187 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Train.R for Second Annual Data Science Bowl
-# Deep learning model with GPU support
-# Please refer to https://mxnet.readthedocs.org/en/latest/build.html#r-package-installation
-# for installation guide
-
-require(mxnet)
-require(data.table)
-
-##A lenet style net, takes difference of each frame as input.
-get.lenet <- function() {
-  source <- mx.symbol.Variable("data")
-  source <- (source-128) / 128
-  frames <- mx.symbol.SliceChannel(source, num.outputs = 30)
-  diffs <- list()
-  for (i in 1:29) {
-    diffs <- c(diffs, frames[[i + 1]] - frames[[i]])
-  }
-  diffs$num.args = 29
-  source <- mxnet:::mx.varg.symbol.Concat(diffs)
-  net <-
-    mx.symbol.Convolution(source, kernel = c(5, 5), num.filter = 40)
-  net <- mx.symbol.BatchNorm(net, fix.gamma = TRUE)
-  net <- mx.symbol.Activation(net, act.type = "relu")
-  net <-
-    mx.symbol.Pooling(
-      net, pool.type = "max", kernel = c(2, 2), stride = c(2, 2)
-    )
-  net <-
-    mx.symbol.Convolution(net, kernel = c(3, 3), num.filter = 40)
-  net <- mx.symbol.BatchNorm(net, fix.gamma = TRUE)
-  net <- mx.symbol.Activation(net, act.type = "relu")
-  net <-
-    mx.symbol.Pooling(
-      net, pool.type = "max", kernel = c(2, 2), stride = c(2, 2)
-    )
-  # first fullc
-  flatten <- mx.symbol.Flatten(net)
-  flatten <- mx.symbol.Dropout(flatten)
-  fc1 <- mx.symbol.FullyConnected(data = flatten, num.hidden = 600)
-  # Name the final layer as softmax so it auto matches the naming of data iterator
-  # Otherwise we can also change the provide_data in the data iter
-  return(mx.symbol.LogisticRegressionOutput(data = fc1, name = 'softmax'))
-}
-
-network <- get.lenet()
-batch_size <- 32
-
-# CSVIter is uesed here, since the data can't fit into memory
-data_train <- mx.io.CSVIter(
-  data.csv = "./train-64x64-data.csv", data.shape = c(64, 64, 30),
-  label.csv = "./train-stytole.csv", label.shape = 600,
-  batch.size = batch_size
-)
-
-data_validate <- mx.io.CSVIter(
-  data.csv = "./validate-64x64-data.csv",
-  data.shape = c(64, 64, 30),
-  batch.size = 1
-)
-
-# Custom evaluation metric on CRPS.
-mx.metric.CRPS <- mx.metric.custom("CRPS", function(label, pred) {
-  pred <- as.array(pred)
-  label <- as.array(label)
-  for (i in 1:dim(pred)[2]) {
-    for (j in 1:(dim(pred)[1] - 1)) {
-      if (pred[j, i] > pred[j + 1, i]) {
-        pred[j + 1, i] = pred[j, i]
-      }
-    }
-  }
-  return(sum((label - pred) ^ 2) / length(label))
-})
-
-# Training the stytole net
-mx.set.seed(0)
-stytole_model <- mx.model.FeedForward.create(
-  X = data_train,
-  ctx = mx.gpu(0),
-  symbol = network,
-  num.round = 65,
-  learning.rate = 0.001,
-  wd = 0.00001,
-  momentum = 0.9,
-  eval.metric = mx.metric.CRPS
-)
-
-# Predict stytole
-stytole_prob = predict(stytole_model, data_validate)
-
-# Training the diastole net
-network = get.lenet()
-batch_size = 32
-data_train <-
-  mx.io.CSVIter(
-    data.csv = "./train-64x64-data.csv", data.shape = c(64, 64, 30),
-    label.csv = "./train-diastole.csv", label.shape = 600,
-    batch.size = batch_size
-  )
-
-diastole_model = mx.model.FeedForward.create(
-  X = data_train,
-  ctx = mx.gpu(0),
-  symbol = network,
-  num.round = 65,
-  learning.rate = 0.001,
-  wd = 0.00001,
-  momentum = 0.9,
-  eval.metric = mx.metric.CRPS
-)
-
-# Predict diastole
-diastole_prob = predict(diastole_model, data_validate)
-
-accumulate_result <- function(validate_lst, prob) {
-  t <- read.table(validate_lst, sep = ",")
-  p <- cbind(t[,1], t(prob))
-  dt <- as.data.table(p)
-  return(dt[, lapply(.SD, mean), by = V1])
-}
-
-stytole_result = as.data.frame(accumulate_result("./validate-label.csv", stytole_prob))
-diastole_result = as.data.frame(accumulate_result("./validate-label.csv", diastole_prob))
-
-train_csv <- read.table("./train-label.csv", sep = ',')
-
-# we have 2 person missing due to frame selection, use udibr's hist result instead
-doHist <- function(data) {
-  res <- rep(0, 600)
-  for (i in 1:length(data)) {
-    for (j in round(data[i]):600) {
-      res[j] = res[j] + 1
-    }
-  }
-  return(res / length(data))
-}
-
-hSystole = doHist(train_csv[, 2])
-hDiastole = doHist(train_csv[, 3])
-
-res <- read.table("data/sample_submission_validate.csv", sep = ",", header = TRUE, stringsAsFactors = FALSE)
-
-submission_helper <- function(pred) {
-  for (i in 2:length(pred)) {
-    if (pred[i] < pred[i - 1]) {
-      pred[i] = pred[i - 1]
-    }
-  }
-  return(pred)
-}
-
-for (i in 1:nrow(res)) {
-  key <- unlist(strsplit(res$Id[i], "_"))[1]
-  target <- unlist(strsplit(res$Id[i], "_"))[2]
-  if (key %in% stytole_result$V1) {
-    if (target == 'Diastole') {
-      res[i, 2:601] <- submission_helper(diastole_result[which(diastole_result$V1 == key), 2:601])
-    } else {
-      res[i, 2:601] <- submission_helper(stytole_result[which(stytole_result$V1 == key), 2:601])
-    }
-  } else {
-    if (target == 'Diastole') {
-      res[i, 2:601] <- hDiastole
-    } else {
-      res[i, 2:601] <- hSystole
-    }
-  }
-}
-
-write.table(res, file = "submission.csv", sep = ",", quote = FALSE, row.names = FALSE)
diff --git a/example/multi_threaded_inference/get_model.py b/example/multi_threaded_inference/get_model.py
deleted file mode 100644
index 36b36ff28d25..000000000000
--- a/example/multi_threaded_inference/get_model.py
+++ /dev/null
@@ -1,38 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import logging
-import argparse
-import mxnet as mx
-import gluoncv
-
-
-models = ["imagenet1k-inception-bn", "imagenet1k-resnet-50",
-          "imagenet1k-resnet-152", "imagenet1k-resnet-18"]
-
-def main():
-    logging.basicConfig()
-    logger = logging.getLogger("logger")
-    logger.setLevel(logging.INFO)
-    parser = argparse.ArgumentParser(description='Download model hybridize and save as symbolic model for multithreaded inference')
-    parser.add_argument("--model", type=str, choices=models, required=True)
-    args = parser.parse_args()
-
-    mx.test_utils.download_model(args.model)
-
-if __name__ == "__main__":
-    main()
diff --git a/example/quantization/README.md b/example/quantization/README.md
deleted file mode 100644
index b934a811f31d..000000000000
--- a/example/quantization/README.md
+++ /dev/null
@@ -1,360 +0,0 @@
-# Model Quantization with Calibration Examples
-
-This folder contains examples of quantizing a FP32 model with Intel® MKL-DNN or CUDNN.
-
-<h2 id="0">Contents</h2>
-
-* [1. Model Quantization with Intel® MKL-DNN](#1)
-* [2. Model Quantization with CUDNN](#2)
-
-<h2 id="1">Model Quantization with Intel® MKL-DNN</h2>
-
-Intel® MKL-DNN supports quantization with subgraph features on Intel® CPU Platform and can bring performance improvements on the [Intel® Xeon® Scalable Platform](https://www.intel.com/content/www/us/en/processors/xeon/scalable/xeon-scalable-platform.html). A new quantization script `imagenet_gen_qsym_mkldnn.py` has been designed to launch quantization for image-classification models with Intel® MKL-DNN. This script integrates with [Gluon-CV modelzoo](https://gluon-cv.mxnet.io/model_zoo/classification.html), so that more pre-trained models can be downloaded from Gluon-CV and then converted for quantization. To apply quantization flow to your project directly, please refer [Quantize custom models with MKL-DNN backend](https://mxnet.apache.org/api/python/docs/tutorials/performance/backend/mkldnn/mkldnn_quantization.html).
-
-```
-usage: imagenet_gen_qsym_mkldnn.py [-h] [--model MODEL] [--epoch EPOCH]
-                                   [--no-pretrained] [--batch-size BATCH_SIZE]
-                                   [--label-name LABEL_NAME]
-                                   [--calib-dataset CALIB_DATASET]
-                                   [--image-shape IMAGE_SHAPE]
-                                   [--data-nthreads DATA_NTHREADS]
-                                   [--num-calib-batches NUM_CALIB_BATCHES]
-                                   [--exclude-first-conv] [--shuffle-dataset]
-                                   [--shuffle-chunk-seed SHUFFLE_CHUNK_SEED]
-                                   [--shuffle-seed SHUFFLE_SEED]
-                                   [--calib-mode CALIB_MODE]
-                                   [--quantized-dtype {auto,int8,uint8}]
-                                   [--enable-calib-quantize ENABLE_CALIB_QUANTIZE]
-
-Generate a calibrated quantized model from a FP32 model with Intel MKL-DNN
-support
-
-optional arguments:
-  -h, --help            show this help message and exit
-  --model MODEL         model to be quantized.
-  --epoch EPOCH         number of epochs, default is 0
-  --no-pretrained       If enabled, will not download pretrained model from
-                        MXNet or Gluon-CV modelzoo.
-  --batch-size BATCH_SIZE
-  --label-name LABEL_NAME
-  --calib-dataset CALIB_DATASET
-                        path of the calibration dataset
-  --image-shape IMAGE_SHAPE
-  --data-nthreads DATA_NTHREADS
-                        number of threads for data decoding
-  --num-calib-batches NUM_CALIB_BATCHES
-                        number of batches for calibration
-  --exclude-first-conv  excluding quantizing the first conv layer since the
-                        input data may have negative value which doesn't
-                        support at moment
-  --shuffle-dataset     shuffle the calibration dataset
-  --shuffle-chunk-seed SHUFFLE_CHUNK_SEED
-                        shuffling chunk seed, see https://mxnet.incubator.apac
-                        he.org/api/python/io/io.html?highlight=imager#mxnet.io
-                        .ImageRecordIter for more details
-  --shuffle-seed SHUFFLE_SEED
-                        shuffling seed, see https://mxnet.incubator.apache.org
-                        /api/python/io/io.html?highlight=imager#mxnet.io.Image
-                        RecordIter for more details
-  --calib-mode CALIB_MODE
-                        calibration mode used for generating calibration table
-                        for the quantized symbol; supports 1. none: no
-                        calibration will be used. The thresholds for
-                        quantization will be calculated on the fly. This will
-                        result in inference speed slowdown and loss of
-                        accuracy in general. 2. naive: simply take min and max
-                        values of layer outputs as thresholds for
-                        quantization. In general, the inference accuracy
-                        worsens with more examples used in calibration. It is
-                        recommended to use `entropy` mode as it produces more
-                        accurate inference results. 3. entropy: calculate KL
-                        divergence of the fp32 output and quantized output for
-                        optimal thresholds. This mode is expected to produce
-                        the best inference accuracy of all three kinds of
-                        quantized models if the calibration dataset is
-                        representative enough of the inference dataset.
-  --quantized-dtype {auto,int8,uint8}
-                        quantization destination data type for input data
-  --enable-calib-quantize ENABLE_CALIB_QUANTIZE
-                        If enabled, the quantize op will be calibrated offline
-                        if calibration mode is enabled
-```
-
-A new benchmark script `launch_inference_mkldnn.sh` has been designed to launch performance benchmark for float32 or int8 image-classification models with Intel® MKL-DNN.
-```
-usage: bash ./launch_inference_mkldnn.sh [[[-s symbol_file ] [-b batch_size] [-iter iteraton] [-ins instance] [-c cores/instance]] | [-h]]
-
-optional arguments:
-  -h, --help                show this help message and exit
-  -s, --symbol_file         symbol file for benchmark
-  -b, --batch_size          inference batch size
-                            default: 64
-  -iter, --iteration        inference iteration
-                            default: 500
-  -ins, --instance          launch multi-instance inference
-                            default: one instance per socket
-  -c, --core                number of cores per instance
-                            default: divide full physical cores
-
-example: resnet int8 performance benchmark on c5.24xlarge(duo sockets, 24 physical cores per socket).
-
-    bash ./launch_inference_mkldnn.sh -s ./model/resnet50_v1-quantized-5batches-naive-symbol.json
-
-will launch two instances for throughput benchmark and each instance will use 24 physical cores.
-```
-
-Use the following command to install [Gluon-CV](https://gluon-cv.mxnet.io/):
-
-```
-pip install gluoncv
-```
-
-The following models have been tested on Linux systems. Accuracy is collected on Intel XEON Cascade Lake CPU. For CPU with Skylake Lake or eariler architecture, the accuracy may not be the same.
-
-| Model | Source | Dataset | FP32 Accuracy (top-1/top-5)| INT8 Accuracy (top-1/top-5)|
-|:---|:---|---|:---:|:---:|
-| [ResNet18-V1](#3)  | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)  | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)  |70.15%/89.38%|69.92%/89.30%|
-| [ResNet50-V1](#3)  | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)  | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)  | 76.34%/93.13%  |  76.06%/92.99% |
-| [ResNet101-V1](#3)  | [Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)  | [Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)  | 77.33%/93.59%  | 77.07%/93.47%  |
-|[Squeezenet 1.0](#4)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|56.98%/79.20%|56.79%/79.47%|
-|[MobileNet 1.0](#5)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.23%/90.64%|72.06%/90.53%|
-|[MobileNetV2 1.0](#6)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|70.27%/89.62%|69.82%/89.35%|
-|[Inception V3](#7)|[Gluon-CV](https://gluon-cv.mxnet.io/model_zoo/classification.html)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|77.76%/93.83% |78.05%/93.91% |
-|[ResNet152-V2](#8)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/resnet/152-layers/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|76.65%/93.07%|76.25%/92.89%|
-|[Inception-BN](#9)|[MXNet ModelZoo](http://data.mxnet.io/models/imagenet/inception-bn/)|[Validation Dataset](http://data.mxnet.io/data/val_256_q90.rec)|72.28%/90.63%|72.02%/90.53%|
-| [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd)  | VOC2007/2012  | 0.8366 mAP  | 0.8357 mAP  |
-| [SSD-VGG16](#10) | [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd)  | COCO2014  | 0.2552 mAP  | 0.253 mAP  |
-
-<h3 id='3'>ResNetV1</h3>
-
-The following command is to download the pre-trained model from Gluon-CV and transfer it into the symbolic model which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models:
-
-```
-python imagenet_gen_qsym_mkldnn.py --model=resnet50_v1 --num-calib-batches=5 --calib-mode=naive
-```
-
-The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. Set `--model` to `resnet18_v1/resnet50_v1b/resnet101_v1` to quantize other models. The following command is to launch inference.
-
-```
-# Launch FP32 Inference
-python imagenet_inference.py --symbol-file=./model/resnet50_v1-symbol.json --param-file=./model/resnet50_v1-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch INT8 Inference
-python imagenet_inference.py --symbol-file=./model/resnet50_v1-quantized-5batches-naive-symbol.json --param-file=./model/resnet50_v1-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch dummy data Inference
-bash ./launch_inference_mkldnn.sh -s ./model/resnet50_v1-symbol.json
-bash ./launch_inference_mkldnn.sh -s ./model/resnet50_v1-quantized-5batches-naive-symbol.json
-```
-
-<h3 id='4'>SqueezeNet 1.0</h3>
-
-The following command is to download the pre-trained model from Gluon-CV and transfer it into the symbolic model which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models:
-
-```
-python imagenet_gen_qsym_mkldnn.py --model=squeezenet1.0 --num-calib-batches=5 --calib-mode=naive
-```
-The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. The following command is to launch inference.
-
-```
-
-# Launch FP32 Inference
-python imagenet_inference.py --symbol-file=./model/squeezenet1.0-symbol.json --param-file=./model/squeezenet1.0-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch INT8 Inference
-python imagenet_inference.py --symbol-file=./model/squeezenet1.0-quantized-5batches-naive-symbol.json --param-file=./model/squeezenet1.0-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch dummy data Inference
-bash ./launch_inference_mkldnn.sh -s ./model/squeezenet1.0-symbol.json
-bash ./launch_inference_mkldnn.sh -s ./model/squeezenet1.0-quantized-5batches-naive-symbol.json 
-```
-
-<h3 id='5'>MobileNet 1.0</h3>
-
-The following command is to download the pre-trained model from Gluon-CV and transfer it into the symbolic model which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models:
-
-```
-python imagenet_gen_qsym_mkldnn.py --model=mobilenet1.0 --num-calib-batches=5 --calib-mode=naive
-```
-The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. The following command is to launch inference.
-
-```
-
-# Launch FP32 Inference
-python imagenet_inference.py --symbol-file=./model/mobilenet1.0-symbol.json --param-file=./model/mobilenet1.0-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch INT8 Inference
-python imagenet_inference.py --symbol-file=./model/mobilenet1.0-quantized-5batches-naive-symbol.json --param-file=./model/mobilenet1.0-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch dummy data Inference
-bash ./launch_inference_mkldnn.sh -s ./model/mobilenet1.0-symbol.json
-bash ./launch_inference_mkldnn.sh -s ./model/mobilenet1.0-quantized-5batches-naive-symbol.json
-```
-
-<h3 id='6'>MobileNetV2 1.0</h3>
-
-The following command is to download the pre-trained model from Gluon-CV and transfer it into the symbolic model which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models:
-
-```
-python imagenet_gen_qsym_mkldnn.py --model=mobilenetv2_1.0 --num-calib-batches=5 --calib-mode=naive
-```
-The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. The following command is to launch inference.
-
-```
-
-# Launch FP32 Inference
-python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-symbol.json --param-file=./model/mobilenetv2_1.0-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch INT8 Inference
-python imagenet_inference.py --symbol-file=./model/mobilenetv2_1.0-quantized-5batches-naive-symbol.json --param-file=./model/mobilenetv2_1.0-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch dummy data Inference
-bash ./launch_inference_mkldnn.sh -s ./model/mobilenetv2_1.0-symbol.json
-bash ./launch_inference_mkldnn.sh -s ./model/mobilenetv2_1.0-quantized-5batches-naive-symbol.json
-```
-
-<h3 id='7'>Inception-V3</h3>
-
-The following command is to download the pre-trained model from Gluon-CV and transfer it into the symbolic model which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models:
-
-```
-python imagenet_gen_qsym_mkldnn.py --model=inceptionv3 --image-shape=3,299,299 --num-calib-batches=5 --calib-mode=naive
-```
-The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. The following command is to launch inference.
-
-```
-
-# Launch FP32 Inference
-python imagenet_inference.py --symbol-file=./model/inceptionv3-symbol.json --param-file=./model/inceptionv3-0000.params --image-shape=3,299,299 --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch INT8 Inference
-python imagenet_inference.py --symbol-file=./model/inceptionv3-quantized-5batches-naive-symbol.json --param-file=./model/inceptionv3-quantized-0000.params --image-shape=3,299,299 --rgb-mean=123.68,116.779,103.939 --rgb-std=58.393,57.12,57.375 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch dummy data Inference
-bash ./launch_inference_mkldnn.sh -s ./model/inceptionv3-symbol.json
-bash ./launch_inference_mkldnn.sh -s ./model/inceptionv3-quantized-5batches-naive-symbol.json
-```
-
-<h3 id='8'>ResNet152-V2</h3>
-
-The following command is to download the pre-trained model from the [MXNet ModelZoo](http://data.mxnet.io/models/imagenet/resnet/152-layers/) which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models:
-
-```
-python imagenet_gen_qsym_mkldnn.py --model=imagenet1k-resnet-152 --num-calib-batches=5 --calib-mode=naive
-```
-
-The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. The following command is to launch inference.
-
-```
-
-# Launch FP32 Inference
-python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --param-file=./model/imagenet1k-resnet-152-0000.params --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch INT8 Inference
-python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-5batches-naive-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch dummy data Inference
-bash ./launch_inference_mkldnn.sh -s ./model/imagenet1k-resnet-152-symbol.json
-bash ./launch_inference_mkldnn.sh -s ./model/imagenet1k-resnet-152-quantized-5batches-naive-symbol.json
-```
-
-<h3 id='9'>Inception-BN</h3>
-
-The following command is to download the pre-trained model from the [MXNet ModelZoo](http://data.mxnet.io/models/imagenet/inception-bn/) which would be finally quantized. The [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) is available for testing the pre-trained models:
-
-```
-python imagenet_gen_qsym_mkldnn.py --model=imagenet1k-inception-bn --num-calib-batches=5 --calib-mode=naive
-```
-
-The model would be automatically replaced in fusion and quantization format. It is then saved as the quantized symbol and parameter files in the `./model` directory. The following command is to launch inference.
-
-```
-
-# Launch FP32 Inference
-python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-symbol.json --param-file=./model/imagenet1k-inception-bn-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch INT8 Inference
-python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-5batches-naive-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --batch-size=64 --num-inference-batches=500 --dataset=./data/val_256_q90.rec --ctx=cpu
-
-# Launch dummy data Inference
-bash ./launch_inference_mkldnn.sh -s ./model/imagenet1k-inception-bn-symbol.json
-bash ./launch_inference_mkldnn.sh -s ./model/imagenet1k-inception-bn-quantized-5batches-naive-symbol.json
-```
-
-<h3 id='10'>SSD-VGG16</h3>
-
-SSD model is located in [example/ssd](https://github.com/apache/incubator-mxnet/tree/master/example/ssd), follow [the insturctions](https://github.com/apache/incubator-mxnet/tree/master/example/ssd#quantize-model) to run quantized SSD model.
-
-<h3 id='11'>Custom Model</h3>
-
-This script also supports custom symbolic models. You can easily add some quantization layer configs in `imagenet_gen_qsym_mkldnn.py` like below:
-
-```
-else:
-    logger.info('Please set proper RGB configs for model %s' % args.model)
-    # add rgb mean/std of your model.
-    rgb_mean = '0,0,0'
-    rgb_std = '0,0,0'
-    # add layer names you donnot want to quantize.
-    logger.info('Please set proper excluded_sym_names for model %s' % args.model)
-    excluded_sym_names += ['layers']
-    if exclude_first_conv:
-        excluded_sym_names += ['layers']
-```
-
-Some tips on quantization configs:
-
-1. First, you should prepare your data, symbol file (custom-symbol.json) and parameter file (custom-0000.params) of your fp32 symbolic model.
-2. Then, you should run the following command and verify that your fp32 symbolic model runs inference as expected.
-
-```
-
-# Launch FP32 Inference
-python imagenet_inference.py --symbol-file=./model/custom-symbol.json --param-file=./model/custom-0000.params --rgb-mean=* --rgb-std=* --num-skipped-batches=* --batch-size=* --num-inference-batches=*--dataset=./data/* --ctx=cpu
-```
-
-3. Then, you should add `rgb_mean`, `rgb_std` and `excluded_sym_names` in this script.
-
-4. Then, you can run the following command for quantization:
-
-```
-python imagenet_gen_qsym_mkldnn.py --model=custom --num-calib-batches=5 --calib-mode=naive
-```
-
-5. After quantization, the quantized symbol and parameter files will be saved in the `model/` directory.
-
-6. Finally, you can run INT8 inference:
-
-```
-# Launch INT8 Inference
-python imagenet_inference.py --symbol-file=./model/*.json --param-file=./model/*.params --rgb-mean=* --rgb-std=* --num-skipped-batches=* --batch-size=* --num-inference-batches=*--dataset=./data/* --ctx=cpu
-
-# Launch dummy data Inference
-bash ./launch_inference_mkldnn.sh -s ./model/*.json
-```
-
-<h2 id="2">Model Quantization with CUDNN</h2>
-
-This folder contains examples of quantizing a FP32 model with or without calibration and using the calibrated
-quantized for inference. Two pre-trained imagenet models are taken as examples for quantization. One is
-[Resnet-152](http://data.mxnet.io/models/imagenet/resnet/152-layers/), and the other one is
-[Inception with BatchNorm](http://data.mxnet.io/models/imagenet/inception-bn/). The calibration dataset
-is the [validation dataset](http://data.mxnet.io/data/val_256_q90.rec) for testing the pre-trained models.
-
-Here are the details of the four files in this folder.
-- `imagenet_gen_qsym.py` This script provides an example of taking FP32 models and calibration dataset to generate
-calibrated quantized models. When launched for the first time, the script would download the user-specified model,
-either Resnet-152 or Inception,
-and calibration dataset into `model` and `data` folders, respectively. The generated quantized models can be found in
-the `model` folder.
-- `imagenet_inference.py` This script is used for calculating the accuracy of FP32 models or quantized models on the
-validation dataset which was downloaded for calibration in `imagenet_gen_qsym.py`.
-- `launch_quantize.sh` This is a shell script that generates various quantized models for Resnet-152 and
-Inception with BatchNorm with different configurations. Users can copy and paste the command from the script to
-the console to run model quantization for a specific configuration.
-- `launch_inference.sh` This is a shell script that calculate the accuracies of all the quantized models generated
-by invoking `launch_quantize.sh`.
-
-**NOTE**:
-- This example has only been tested on Linux systems.
-- Performance is expected to decrease with GPU, however the memory footprint of a quantized model is smaller. The purpose of the quantization implementation is to minimize accuracy loss when converting FP32 models to INT8. MXNet community is working on improving the performance.
diff --git a/example/quantization/common b/example/quantization/common
deleted file mode 120000
index cafb9140ab6a..000000000000
--- a/example/quantization/common
+++ /dev/null
@@ -1 +0,0 @@
-../image-classification/common
\ No newline at end of file
diff --git a/example/quantization/imagenet_gen_qsym.py b/example/quantization/imagenet_gen_qsym.py
deleted file mode 100644
index e8f84d52ff63..000000000000
--- a/example/quantization/imagenet_gen_qsym.py
+++ /dev/null
@@ -1,208 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import os
-import logging
-from common import modelzoo
-import mxnet as mx
-from mxnet.contrib.quantization import *
-
-
-def download_calib_dataset(dataset_url, calib_dataset, logger=None):
-    if logger is not None:
-        logger.info('Downloading calibration dataset from %s to %s' % (dataset_url, calib_dataset))
-    mx.test_utils.download(dataset_url, calib_dataset)
-
-
-def download_model(model_name, logger=None):
-    dir_path = os.path.dirname(os.path.realpath(__file__))
-    model_path = os.path.join(dir_path, 'model')
-    if logger is not None:
-        logger.info('Downloading model %s... into path %s' % (model_name, model_path))
-    return modelzoo.download_model(args.model, os.path.join(dir_path, 'model'))
-
-
-def save_symbol(fname, sym, logger=None):
-    if logger is not None:
-        logger.info('Saving symbol into file at %s' % fname)
-    sym.save(fname)
-
-
-def save_params(fname, arg_params, aux_params, logger=None):
-    if logger is not None:
-        logger.info('Saving params into file at %s' % fname)
-    save_dict = {('arg:%s' % k): v.as_in_context(cpu()) for k, v in arg_params.items()}
-    save_dict.update({('aux:%s' % k): v.as_in_context(cpu()) for k, v in aux_params.items()})
-    mx.nd.save(fname, save_dict)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model')
-    parser.add_argument('--ctx', type=str, default='gpu')
-    parser.add_argument('--model', type=str, choices=['imagenet1k-resnet-152', 'imagenet1k-inception-bn'],
-                        help='currently only supports imagenet1k-resnet-152 or imagenet1k-inception-bn')
-    parser.add_argument('--batch-size', type=int, default=32)
-    parser.add_argument('--label-name', type=str, default='softmax_label')
-    parser.add_argument('--calib-dataset', type=str, default='data/val_256_q90.rec',
-                        help='path of the calibration dataset')
-    parser.add_argument('--image-shape', type=str, default='3,224,224')
-    parser.add_argument('--data-nthreads', type=int, default=60,
-                        help='number of threads for data decoding')
-    parser.add_argument('--num-calib-batches', type=int, default=10,
-                        help='number of batches for calibration')
-    parser.add_argument('--exclude-first-conv', action='store_true', default=True,
-                        help='excluding quantizing the first conv layer since the'
-                             ' number of channels is usually not a multiple of 4 in that layer'
-                             ' which does not satisfy the requirement of cuDNN')
-    parser.add_argument('--shuffle-dataset', action='store_true', default=True,
-                        help='shuffle the calibration dataset')
-    parser.add_argument('--shuffle-chunk-seed', type=int, default=3982304,
-                        help='shuffling chunk seed, see'
-                             ' https://mxnet.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter'
-                             ' for more details')
-    parser.add_argument('--shuffle-seed', type=int, default=48564309,
-                        help='shuffling seed, see'
-                             ' https://mxnet.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter'
-                             ' for more details')
-    parser.add_argument('--calib-mode', type=str, default='entropy',
-                        help='calibration mode used for generating calibration table for the quantized symbol; supports'
-                             ' 1. none: no calibration will be used. The thresholds for quantization will be calculated'
-                             ' on the fly. This will result in inference speed slowdown and loss of accuracy'
-                             ' in general.'
-                             ' 2. naive: simply take min and max values of layer outputs as thresholds for'
-                             ' quantization. In general, the inference accuracy worsens with more examples used in'
-                             ' calibration. It is recommended to use `entropy` mode as it produces more accurate'
-                             ' inference results.'
-                             ' 3. entropy: calculate KL divergence of the fp32 output and quantized output for optimal'
-                             ' thresholds. This mode is expected to produce the best inference accuracy of all three'
-                             ' kinds of quantized models if the calibration dataset is representative enough of the'
-                             ' inference dataset.')
-    parser.add_argument('--quantized-dtype', type=str, default='int8',
-                        choices=['int8', 'uint8'],
-                        help='quantization destination data type for input data')
-    args = parser.parse_args()
-
-    if args.ctx == 'gpu':
-        ctx = mx.gpu(0)
-    elif args.ctx == 'cpu':
-        ctx = mx.cpu(0)
-    else:
-        raise ValueError('ctx %s is not supported in this script' % args.ctx)
-
-    logging.basicConfig()
-    logger = logging.getLogger('logger')
-    logger.setLevel(logging.INFO)
-
-    logger.info('shuffle_dataset=%s' % args.shuffle_dataset)
-
-    calib_mode = args.calib_mode
-    logger.info('calibration mode set to %s' % calib_mode)
-
-    # download calibration dataset
-    if calib_mode != 'none':
-        download_calib_dataset('http://data.mxnet.io/data/val_256_q90.rec', args.calib_dataset)
-
-    # download model
-    prefix, epoch = download_model(model_name=args.model, logger=logger)
-    sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
-
-    # get batch size
-    batch_size = args.batch_size
-    logger.info('batch size = %d for calibration' % batch_size)
-
-    # get number of batches for calibration
-    num_calib_batches = args.num_calib_batches
-    if calib_mode != 'none':
-        logger.info('number of batches = %d for calibration' % num_calib_batches)
-
-    # get number of threads for decoding the dataset
-    data_nthreads = args.data_nthreads
-
-    # get image shape
-    image_shape = args.image_shape
-
-    exclude_first_conv = args.exclude_first_conv
-    excluded_sym_names = []
-    excluded_op_names = []
-    if args.model == 'imagenet1k-resnet-152':
-        rgb_mean = '0,0,0'
-        excluded_sym_names += ['flatten0', 'fc1']
-        if exclude_first_conv:
-            excluded_sym_names += ['conv0']
-    elif args.model == 'imagenet1k-inception-bn':
-        rgb_mean = '123.68,116.779,103.939'
-        if args.ctx == 'cpu':
-            excluded_sym_names += ['flatten', 'fc1']
-        if exclude_first_conv:
-            excluded_sym_names += ['conv_1']
-    else:
-        raise ValueError('model %s is not supported in this script' % args.model)
-
-    label_name = args.label_name
-    logger.info('label_name = %s' % label_name)
-
-    data_shape = tuple([int(i) for i in image_shape.split(',')])
-    logger.info('Input data shape = %s' % str(data_shape))
-
-    logger.info('rgb_mean = %s' % rgb_mean)
-    rgb_mean = [float(i) for i in rgb_mean.split(',')]
-    mean_args = {'mean_r': rgb_mean[0], 'mean_g': rgb_mean[1], 'mean_b': rgb_mean[2]}
-
-    if calib_mode == 'none':
-        logger.info('Quantizing FP32 model %s' % args.model)
-        qsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
-                                                       ctx=ctx, excluded_sym_names=excluded_sym_names,
-                                                       excluded_op_names=excluded_op_names,
-                                                       calib_mode=calib_mode, quantized_dtype=args.quantized_dtype,
-                                                       logger=logger)
-        sym_name = '%s-symbol.json' % (prefix + '-quantized')
-        save_symbol(sym_name, qsym, logger)
-    else:
-        logger.info('Creating ImageRecordIter for reading calibration dataset')
-        data = mx.io.ImageRecordIter(path_imgrec=args.calib_dataset,
-                                     label_width=1,
-                                     preprocess_threads=data_nthreads,
-                                     batch_size=batch_size,
-                                     data_shape=data_shape,
-                                     label_name=label_name,
-                                     rand_crop=False,
-                                     rand_mirror=False,
-                                     shuffle=args.shuffle_dataset,
-                                     shuffle_chunk_seed=args.shuffle_chunk_seed,
-                                     seed=args.shuffle_seed,
-                                     **mean_args)
-
-        cqsym, qarg_params, aux_params = quantize_model(sym=sym, arg_params=arg_params, aux_params=aux_params,
-                                                        ctx=ctx, excluded_sym_names=excluded_sym_names,
-                                                        excluded_op_names=excluded_op_names,
-                                                        calib_mode=calib_mode, calib_data=data,
-                                                        num_calib_examples=num_calib_batches * batch_size,
-                                                        quantized_dtype=args.quantized_dtype,
-                                                        logger=logger)
-        if calib_mode == 'entropy':
-            suffix = '-quantized-%dbatches-entropy' % num_calib_batches
-        elif calib_mode == 'naive':
-            suffix = '-quantized-%dbatches-naive' % num_calib_batches
-        else:
-            raise ValueError('unknow calibration mode %s received, only supports `none`, `naive`, and `entropy`'
-                             % calib_mode)
-        sym_name = '%s-symbol.json' % (prefix + suffix)
-        save_symbol(sym_name, cqsym, logger)
-
-    param_name = '%s-%04d.params' % (prefix + '-quantized', epoch)
-    save_params(param_name, qarg_params, aux_params, logger)
diff --git a/example/quantization/imagenet_gen_qsym_mkldnn.py b/example/quantization/imagenet_gen_qsym_mkldnn.py
deleted file mode 100644
index 130282714e30..000000000000
--- a/example/quantization/imagenet_gen_qsym_mkldnn.py
+++ /dev/null
@@ -1,328 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import os
-import logging
-from common import modelzoo
-import mxnet as mx
-import gluoncv
-from mxnet import gluon, nd, image
-from gluoncv import utils
-from gluoncv.model_zoo import get_model
-from mxnet.contrib.quantization import *
-from mxnet.base import SymbolHandle, check_call, _LIB, mx_uint, c_str_array
-import ctypes
-
-
-def download_calib_dataset(dataset_url, calib_dataset, logger=None):
-    if logger is not None:
-        logger.info('Downloading calibration dataset from %s to %s' % (dataset_url, calib_dataset))
-    mx.test_utils.download(dataset_url, calib_dataset)
-
-
-def download_model(model_name, logger=None):
-    dir_path = os.path.dirname(os.path.realpath(__file__))
-    model_path = os.path.join(dir_path, 'model')
-    if logger is not None:
-        logger.info('Downloading model %s... into path %s' % (model_name, model_path))
-    return modelzoo.download_model(args.model, os.path.join(dir_path, 'model'))
-
-def convert_from_gluon(model_name, image_shape, classes=1000, logger=None):
-    dir_path = os.path.dirname(os.path.realpath(__file__))
-    model_path = os.path.join(dir_path, 'model')
-    if logger is not None:
-        logger.info('Converting model from Gluon-CV ModelZoo %s... into path %s' % (model_name, model_path))
-    net = get_model(name=model_name, classes=classes, pretrained=True)
-    net.hybridize()
-    x = mx.sym.var('data')
-    y = net(x)
-    y = mx.sym.SoftmaxOutput(data=y, name='softmax')
-    symnet = mx.symbol.load_json(y.tojson())
-    params = net.collect_params()
-    args = {}
-    auxs = {}
-    for param in params.values():
-        v = param._reduce()
-        k = param.name
-        if 'running' in k:
-            auxs[k] = v
-        else:
-            args[k] = v
-    mod = mx.mod.Module(symbol=symnet, context=mx.cpu(),
-                        label_names = ['softmax_label'])
-    mod.bind(for_training=False,
-             data_shapes=[('data', (1,) +
-                          tuple([int(i) for i in image_shape.split(',')]))])
-    mod.set_params(arg_params=args, aux_params=auxs)
-    dst_dir = os.path.join(dir_path, 'model')
-    prefix = os.path.join(dir_path, 'model', model_name)
-    if not os.path.isdir(dst_dir):
-        os.mkdir(dst_dir)
-    mod.save_checkpoint(prefix, 0)
-    return prefix
-
-def save_symbol(fname, sym, logger=None):
-    if logger is not None:
-        logger.info('Saving symbol into file at %s' % fname)
-    sym.save(fname)
-
-
-def save_params(fname, arg_params, aux_params, logger=None):
-    if logger is not None:
-        logger.info('Saving params into file at %s' % fname)
-    save_dict = {('arg:%s' % k): v.as_in_context(cpu()) for k, v in arg_params.items()}
-    save_dict.update({('aux:%s' % k): v.as_in_context(cpu()) for k, v in aux_params.items()})
-    mx.nd.save(fname, save_dict)
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Generate a calibrated quantized model from a FP32 model with Intel MKL-DNN support')
-    parser.add_argument('--model', type=str, default='resnet50_v1',
-                        help='model to be quantized.')
-    parser.add_argument('--epoch', type=int, default=0,
-                        help='number of epochs, default is 0')
-    parser.add_argument('--no-pretrained', action='store_true', default=False,
-                        help='If enabled, will not download pretrained model from MXNet or Gluon-CV modelzoo.')
-    parser.add_argument('--batch-size', type=int, default=32)
-    parser.add_argument('--label-name', type=str, default='softmax_label')
-    parser.add_argument('--calib-dataset', type=str, default='data/val_256_q90.rec',
-                        help='path of the calibration dataset')
-    parser.add_argument('--image-shape', type=str, default='3,224,224')
-    parser.add_argument('--data-nthreads', type=int, default=60,
-                        help='number of threads for data decoding')
-    parser.add_argument('--num-calib-batches', type=int, default=10,
-                        help='number of batches for calibration')
-    parser.add_argument('--exclude-first-conv', action='store_true', default=False,
-                        help='excluding quantizing the first conv layer since the'
-                             ' input data may have negative value which doesn\'t support at moment' )
-    parser.add_argument('--shuffle-dataset', action='store_true', default=True,
-                        help='shuffle the calibration dataset')
-    parser.add_argument('--shuffle-chunk-seed', type=int, default=3982304,
-                        help='shuffling chunk seed, see'
-                             ' https://mxnet.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter'
-                             ' for more details')
-    parser.add_argument('--shuffle-seed', type=int, default=48564309,
-                        help='shuffling seed, see'
-                             ' https://mxnet.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter'
-                             ' for more details')
-    parser.add_argument('--calib-mode', type=str, default='entropy',
-                        help='calibration mode used for generating calibration table for the quantized symbol; supports'
-                             ' 1. none: no calibration will be used. The thresholds for quantization will be calculated'
-                             ' on the fly. This will result in inference speed slowdown and loss of accuracy'
-                             ' in general.'
-                             ' 2. naive: simply take min and max values of layer outputs as thresholds for'
-                             ' quantization. In general, the inference accuracy worsens with more examples used in'
-                             ' calibration. It is recommended to use `entropy` mode as it produces more accurate'
-                             ' inference results.'
-                             ' 3. entropy: calculate KL divergence of the fp32 output and quantized output for optimal'
-                             ' thresholds. This mode is expected to produce the best inference accuracy of all three'
-                             ' kinds of quantized models if the calibration dataset is representative enough of the'
-                             ' inference dataset.')
-    parser.add_argument('--quantized-dtype', type=str, default='auto',
-                        choices=['auto', 'int8', 'uint8'],
-                        help='quantization destination data type for input data')
-    parser.add_argument('--enable-calib-quantize', type=bool, default=True,
-                        help='If enabled, the quantize op will '
-                             'be calibrated offline if calibration mode is '
-                             'enabled')
-    parser.add_argument('--quiet', action='store_true', default=False,
-                        help='suppress most of log')
-    args = parser.parse_args()
-    ctx = mx.cpu(0)
-    logger = None
-    if not args.quiet:
-        logging.basicConfig()
-        logger = logging.getLogger('logger')
-        logger.setLevel(logging.INFO)
-
-    if logger:
-        logger.info(args)
-        logger.info('shuffle_dataset=%s' % args.shuffle_dataset)
-
-    calib_mode = args.calib_mode
-    if logger:
-        logger.info('calibration mode set to %s' % calib_mode)
-
-    # download calibration dataset
-    if calib_mode != 'none':
-        download_calib_dataset('http://data.mxnet.io/data/val_256_q90.rec', args.calib_dataset)
-
-    # download model
-    if not args.no_pretrained:
-        if logger:
-            logger.info('Get pre-trained model from MXNet or Gluoncv modelzoo.')
-            logger.info('If you want to use custom model, please set --no-pretrained.')
-        if args.model in ['imagenet1k-resnet-152', 'imagenet1k-inception-bn']:
-            if logger:
-                logger.info('model %s is downloaded from MXNet modelzoo' % args.model)
-            prefix, epoch = download_model(model_name=args.model, logger=logger)
-        else:
-            if logger:
-                logger.info('model %s is converted from GluonCV' % args.model)
-            prefix = convert_from_gluon(model_name=args.model, image_shape=args.image_shape, classes=1000, logger=logger)
-            rgb_mean = '123.68,116.779,103.939'
-            rgb_std = '58.393, 57.12, 57.375'
-            epoch = 0
-    else:
-        dir_path = os.path.dirname(os.path.realpath(__file__))
-        prefix = os.path.join(dir_path, 'model', args.model)
-        epoch = args.epoch
-
-    sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
-
-    # get batch size
-    batch_size = args.batch_size
-    if logger:
-        logger.info('batch size = %d for calibration' % batch_size)
-
-    # get number of batches for calibration
-    num_calib_batches = args.num_calib_batches
-    if logger:
-        if calib_mode == 'none':
-            logger.info('skip calibration step as calib_mode is none')
-        else:
-            logger.info('number of batches = %d for calibration' % num_calib_batches)
-
-    # get number of threads for decoding the dataset
-    data_nthreads = args.data_nthreads
-
-    # get image shape
-    image_shape = args.image_shape
-
-    exclude_first_conv = args.exclude_first_conv
-    if args.quantized_dtype == "uint8":
-        if logger:
-            logger.info('quantized dtype is set to uint8, will exclude first conv.')
-        exclude_first_conv = True
-    excluded_sym_names = []
-    if not args.no_pretrained:
-        if args.model == 'imagenet1k-resnet-152':
-            rgb_mean = '0,0,0'
-            rgb_std = '1,1,1'
-            # stage1_unit1_bn1 & stage4_unit1_bn1 is excluded for the sake of accuracy
-            excluded_sym_names += ['flatten0', 'stage1_unit1_bn1', 'stage4_unit1_bn1']
-            if exclude_first_conv:
-                excluded_sym_names += ['conv0']
-        elif args.model == 'imagenet1k-inception-bn':
-            rgb_mean = '123.68,116.779,103.939'
-            rgb_std = '1,1,1'
-            excluded_sym_names += ['flatten']
-            if exclude_first_conv:
-                excluded_sym_names += ['conv_1']
-        elif args.model.find('resnet') != -1 and args.model.find('v1') != -1:
-            if exclude_first_conv:
-                excluded_sym_names += ['resnetv10_conv0_fwd']
-        elif args.model.find('resnet') != -1 and args.model.find('v2') != -1:
-            # resnetv20_stage1_batchnorm0_fwd is excluded for the sake of accuracy
-            excluded_sym_names += ['resnetv20_flatten0_flatten0', 'resnetv20_stage1_batchnorm0_fwd']
-            if exclude_first_conv:
-                excluded_sym_names += ['resnetv20_conv0_fwd']
-        elif args.model.find('vgg') != -1:
-            if exclude_first_conv:
-                excluded_sym_names += ['vgg0_conv0_fwd']
-        elif args.model.find('squeezenet1') != -1:
-            excluded_sym_names += ['squeezenet0_flatten0_flatten0']
-            if exclude_first_conv:
-                excluded_sym_names += ['squeezenet0_conv0_fwd']
-        elif args.model.find('mobilenet') != -1 and args.model.find('v2') == -1:
-            excluded_sym_names += ['mobilenet0_flatten0_flatten0',
-                                'mobilenet0_pool0_fwd']
-            if exclude_first_conv:
-                excluded_sym_names += ['mobilenet0_conv0_fwd']
-        elif args.model.find('mobilenet') != -1 and args.model.find('v2') != -1:
-            excluded_sym_names += ['mobilenetv20_output_flatten0_flatten0']
-            if exclude_first_conv:
-                excluded_sym_names += ['mobilenetv20_conv0_fwd']
-        elif args.model == 'inceptionv3':
-            if exclude_first_conv:
-                excluded_sym_names += ['inception30_conv0_fwd']
-        else:
-            raise ValueError('Currently, model %s is not supported in this script' % args.model)
-    else:
-        if logger:
-            logger.info('Please set proper RGB configs for model %s' % args.model)
-        # add rgb mean/std of your model.
-        rgb_mean = '0,0,0'
-        rgb_std = '0,0,0'
-        # add layer names you donnot want to quantize.
-        if logger:
-            logger.info('Please set proper excluded_sym_names for model %s' % args.model)
-        excluded_sym_names += ['layers']
-        if exclude_first_conv:
-            excluded_sym_names += ['layers']
-
-    if logger:
-        logger.info('These layers have been excluded %s' % excluded_sym_names)
-
-    label_name = args.label_name
-    if logger:
-        logger.info('label_name = %s' % label_name)
-
-    data_shape = tuple([int(i) for i in image_shape.split(',')])
-    if logger:
-        logger.info('Input data shape = %s' % str(data_shape))
-        logger.info('rgb_mean = %s' % rgb_mean)
-        logger.info('rgb_std = %s' % rgb_std)
-    rgb_mean = [float(i) for i in rgb_mean.split(',')]
-    mean_args = {'mean_r': rgb_mean[0], 'mean_g': rgb_mean[1], 'mean_b': rgb_mean[2]}
-    rgb_std = [float(i) for i in rgb_std.split(',')]
-    std_args = {'std_r': rgb_std[0], 'std_g': rgb_std[1], 'std_b': rgb_std[2]}
-    combine_mean_std = {}
-    combine_mean_std.update(mean_args)
-    combine_mean_std.update(std_args)
-    if calib_mode == 'none':
-        if logger:
-            logger.info('Quantizing FP32 model %s' % args.model)
-        qsym, qarg_params, aux_params = quantize_model_mkldnn(sym=sym, arg_params=arg_params, aux_params=aux_params,
-                                                              ctx=ctx, excluded_sym_names=excluded_sym_names,
-                                                              calib_mode=calib_mode, quantized_dtype=args.quantized_dtype,
-                                                              logger=logger)
-        sym_name = '%s-symbol.json' % (prefix + '-quantized')
-    else:
-        if logger:
-            logger.info('Creating ImageRecordIter for reading calibration dataset')
-        data = mx.io.ImageRecordIter(path_imgrec=args.calib_dataset,
-                                     label_width=1,
-                                     preprocess_threads=data_nthreads,
-                                     batch_size=batch_size,
-                                     data_shape=data_shape,
-                                     label_name=label_name,
-                                     rand_crop=False,
-                                     rand_mirror=False,
-                                     shuffle=args.shuffle_dataset,
-                                     shuffle_chunk_seed=args.shuffle_chunk_seed,
-                                     seed=args.shuffle_seed,
-                                     **combine_mean_std)
-
-        qsym, qarg_params, aux_params = quantize_model_mkldnn(sym=sym, arg_params=arg_params, aux_params=aux_params,
-                                                              ctx=ctx, excluded_sym_names=excluded_sym_names,
-                                                              calib_mode=calib_mode, calib_data=data,
-                                                              num_calib_examples=num_calib_batches * batch_size,
-                                                              quantized_dtype=args.quantized_dtype,
-                                                              label_names=(label_name,), logger=logger)
-        if calib_mode == 'entropy':
-            suffix = '-quantized-%dbatches-entropy' % num_calib_batches
-        elif calib_mode == 'naive':
-            suffix = '-quantized-%dbatches-naive' % num_calib_batches
-        else:
-            raise ValueError('unknow calibration mode %s received, only supports `none`, `naive`, and `entropy`'
-                             % calib_mode)
-        sym_name = '%s-symbol.json' % (prefix + suffix)
-    save_symbol(sym_name, qsym, logger)
-    param_name = '%s-%04d.params' % (prefix + '-quantized', epoch)
-    save_params(param_name, qarg_params, aux_params, logger)
diff --git a/example/quantization/imagenet_inference.py b/example/quantization/imagenet_inference.py
deleted file mode 100644
index 2f41fec2a9a3..000000000000
--- a/example/quantization/imagenet_inference.py
+++ /dev/null
@@ -1,307 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import argparse
-import logging
-import os
-import time
-import numpy as np
-import mxnet as mx
-from mxnet import nd
-from mxnet.contrib.quantization import *
-from mxnet.contrib import amp
-
-
-def download_dataset(dataset_url, dataset_dir, logger=None):
-    if logger is not None:
-        logger.info('Downloading dataset for inference from %s to %s' % (dataset_url, dataset_dir))
-    mx.test_utils.download(dataset_url, dataset_dir)
-
-
-def load_model(symbol_file, param_file, logger=None):
-    cur_path = os.path.dirname(os.path.realpath(__file__))
-    symbol_file_path = os.path.join(cur_path, symbol_file)
-    if logger is not None:
-        logger.info('Loading symbol from file %s' % symbol_file_path)
-    symbol = mx.sym.load(symbol_file_path)
-
-    param_file_path = os.path.join(cur_path, param_file)
-    if logger is not None:
-        logger.info('Loading params from file %s' % param_file_path)
-    save_dict = nd.load(param_file_path)
-    arg_params = {}
-    aux_params = {}
-    for k, v in save_dict.items():
-        tp, name = k.split(':', 1)
-        if tp == 'arg':
-            arg_params[name] = v
-        if tp == 'aux':
-            aux_params[name] = v
-    return symbol, arg_params, aux_params
-
-
-def advance_data_iter(data_iter, n):
-    assert n >= 0
-    if n == 0:
-        return data_iter
-    has_next_batch = True
-    while has_next_batch:
-        try:
-            data_iter.next()
-            n -= 1
-            if n == 0:
-                return data_iter
-        except StopIteration:
-            has_next_batch = False
-
-
-def score(sym, arg_params, aux_params, data, devs, label_name, max_num_examples, logger=None):
-    metrics = [mx.gluon.metric.create('acc'),
-               mx.gluon.metric.create('top_k_accuracy', top_k=5)]
-    if not isinstance(metrics, list):
-        metrics = [metrics, ]
-    mod = mx.mod.Module(symbol=sym, context=devs, label_names=[label_name, ])
-    mod.bind(for_training=False,
-             data_shapes=data.provide_data,
-             label_shapes=data.provide_label)
-    mod.set_params(arg_params, aux_params)
-
-    tic = time.time()
-    num = 0
-    for batch in data:
-        mod.forward(batch, is_train=False)
-        for m in metrics:
-            mod.update_metric(m, batch.label)
-        num += batch_size
-        if max_num_examples is not None and num >= max_num_examples:
-            break
-
-    speed = num / (time.time() - tic)
-
-    if logger is not None:
-        logger.info('Finished inference with %d images' % num)
-        logger.info('Finished with %f images per second', speed)
-        logger.warn('Note: GPU performance is expected to be slower than CPU. Please refer quantization/README.md for details')
-        for m in metrics:
-            logger.info(m.get())
-
-
-def low_precison_convert(model_name, low_precision, sym, arg_params, aux_params, excluded_sym_names=[]):
-    if low_precision == 'bfloat16':
-        if model_name.find('imagenet1k-resnet-152') != -1:
-            excluded_sym_names += ['conv0']
-        elif model_name.find('imagenet1k-inception-bn') != -1:
-            excluded_sym_names += ['conv_1']
-        elif model_name.find('resnet') != -1 and model_name.find('v1') != -1:
-            excluded_sym_names += ['resnetv10_conv0_fwd']
-        elif model_name.find('resnet') != -1 and model_name.find('v2') != -1:
-            excluded_sym_names += ['resnetv20_conv0_fwd']
-        elif model_name.find('vgg') != -1:
-            excluded_sym_names += ['vgg0_conv0_fwd']
-        elif model_name.find('squeezenet1') != -1:
-            excluded_sym_names += ['squeezenet0_conv0_fwd']
-        elif model_name.find('mobilenet') != -1 and model_name.find('v2') == -1:
-            excluded_sym_names += ['mobilenet0_conv0_fwd']
-        elif model_name.find('mobilenet') != -1 and model_name.find('v2') != -1:
-            excluded_sym_names += ['mobilenetv20_conv0_fwd']
-        elif model_name.find('inceptionv3') != -1:
-            excluded_sym_names += ['inception30_conv0_fwd']
-    return amp.convert_model(sym,
-                             arg_params,
-                             aux_params,
-                             target_dtype=low_precision,
-                             excluded_sym_names=excluded_sym_names,
-                             cast_optional_params=True)
-
-def benchmark_score(symbol_file, ctx, batch_size, num_batches, data_layer_type, low_precision, logger=None):
-    # get mod
-    cur_path = os.path.dirname(os.path.realpath(__file__))
-    symbol_file_path = os.path.join(cur_path, symbol_file)
-    if logger is not None:
-        logger.info('Loading symbol from file %s' % symbol_file_path)
-    sym = mx.sym.load(symbol_file_path)
-    mod = mx.mod.Module(symbol=sym, context=ctx)
-    if data_layer_type == "int8":
-        dshape = mx.io.DataDesc(name='data', shape=(
-            batch_size,) + data_shape, dtype=np.int8)
-    elif data_layer_type == 'uint8':
-        dshape = mx.io.DataDesc(name='data', shape=(
-            batch_size,) + data_shape, dtype=np.uint8)
-    else:  # float32
-        dshape = mx.io.DataDesc(name='data', shape=(
-            batch_size,) + data_shape, dtype=np.float32)
-    mod.bind(for_training=False,
-             inputs_need_grad=False,
-             data_shapes=[dshape])
-    mod.init_params(initializer=mx.init.Xavier(magnitude=2.))
-
-    if low_precision:
-        arg_params, aux_params = mod.get_params()
-        sym, arg_params, aux_params = low_precison_convert(symbol_file,
-                                                           low_precision,
-                                                           sym, arg_params,
-                                                           aux_params)
-        mod = mx.mod.Module(symbol=sym, context=ctx)
-        mod.bind(for_training=False,
-                 inputs_need_grad=False,
-                 data_shapes=[dshape],
-                 label_shapes=[['softmax_label', (batch_size,)]])
-        mod.set_params(arg_params, aux_params)
-
-    # get data
-    if data_layer_type == "float32":
-        data = [mx.random.uniform(-1.0, 1.0, shape=shape, ctx=ctx, dtype=data_layer_type)
-                for _, shape in mod.data_shapes]
-    else:
-        data = [mx.nd.full(shape=shape, val=127, ctx=ctx, dtype=data_layer_type)
-                for _, shape in mod.data_shapes]
-    batch = mx.io.DataBatch(data, [])  # empty label
-
-    # run
-    dry_run = 5                 # use 5 iterations to warm up
-    for i in range(dry_run+num_batches):
-        if i == dry_run:
-            tic = time.time()
-        mod.forward(batch, is_train=False)
-        for output in mod.get_outputs():
-            output.wait_to_read()
-
-    # return num images per second
-    return num_batches*batch_size/(time.time() - tic)
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(description='Score a model on a dataset')
-    parser.add_argument('--ctx', type=str, default='gpu')
-    parser.add_argument('--benchmark', type=bool, default=False, help='dummy data benchmark')
-    parser.add_argument('--symbol-file', type=str, required=True, help='symbol file path')
-    parser.add_argument('--param-file', type=str, required=False, help='param file path')
-    parser.add_argument('--batch-size', type=int, default=32)
-    parser.add_argument('--label-name', type=str, default='softmax_label')
-    parser.add_argument('--dataset', type=str, required=False, help='dataset path')
-    parser.add_argument('--rgb-mean', type=str, default='0,0,0')
-    parser.add_argument('--rgb-std', type=str, default='1,1,1')
-    parser.add_argument('--image-shape', type=str, default='3,224,224')
-    parser.add_argument('--data-nthreads', type=int, default=60, help='number of threads for data decoding')
-    parser.add_argument('--num-skipped-batches', type=int, default=0, help='skip the number of batches for inference')
-    parser.add_argument('--num-inference-batches', type=int, required=True, help='number of images used for inference')
-    parser.add_argument('--shuffle-dataset', action='store_true', default=True,
-                        help='shuffle the calibration dataset')
-    parser.add_argument('--shuffle-chunk-seed', type=int, default=3982304,
-                        help='shuffling chunk seed, see'
-                             ' https://mxnet.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter'
-                             ' for more details')
-    parser.add_argument('--shuffle-seed', type=int, default=48564309,
-                        help='shuffling seed, see'
-                             ' https://mxnet.apache.org/api/python/io/io.html?highlight=imager#mxnet.io.ImageRecordIter'
-                             ' for more details')
-    parser.add_argument('--data-layer-type', type=str, default='float32',
-                        choices=['float32', 'int8', 'uint8'],
-                        help='data type for data layer')
-    parser.add_argument('--low-precision', type=str, default='',
-                        choices=['', 'float16', 'bfloat16'],
-                        help='enable low precision')
-
-    args = parser.parse_args()
-
-    if args.ctx == 'gpu':
-        ctx = mx.gpu(0)
-    elif args.ctx == 'cpu':
-        ctx = mx.cpu(0)
-    else:
-        raise ValueError('ctx %s is not supported in this script' % args.ctx)
-
-    logging.basicConfig()
-    logger = logging.getLogger('logger')
-    logger.setLevel(logging.INFO)
-
-    symbol_file = args.symbol_file
-    param_file = args.param_file
-    data_nthreads = args.data_nthreads
-
-    batch_size = args.batch_size
-    logger.info('batch size = %d for inference' % batch_size)
-
-    rgb_mean = args.rgb_mean
-    logger.info('rgb_mean = %s' % rgb_mean)
-    rgb_mean = [float(i) for i in rgb_mean.split(',')]
-    mean_args = {'mean_r': rgb_mean[0], 'mean_g': rgb_mean[1], 'mean_b': rgb_mean[2]}
-    rgb_std = args.rgb_std
-    logger.info('rgb_std = %s' % rgb_std)
-    rgb_std = [float(i) for i in rgb_std.split(',')]
-    std_args = {'std_r': rgb_std[0], 'std_g': rgb_std[1], 'std_b': rgb_std[2]}
-    combine_mean_std = {}
-    combine_mean_std.update(mean_args)
-    combine_mean_std.update(std_args)
-
-    label_name = args.label_name
-    logger.info('label_name = %s' % label_name)
-
-    image_shape = args.image_shape
-    data_shape = tuple([int(i) for i in image_shape.split(',')])
-    logger.info('Input data shape = %s' % str(data_shape))
-
-    data_layer_type = args.data_layer_type
-
-    if args.low_precision:
-        if args.ctx == 'gpu':
-            assert args.low_precision == 'float16', "Not supported low-precision options for GPU."
-        elif args.ctx == 'cpu':
-            assert args.low_precision == 'bfloat16', "Not supported low-precision options for CPU."
-
-    if args.benchmark == False:
-        dataset = args.dataset
-        download_dataset('http://data.mxnet.io/data/val_256_q90.rec', dataset)
-        logger.info('Dataset for inference: %s' % dataset)
-
-        # creating data iterator
-        data = mx.io.ImageRecordIter(
-            path_imgrec=dataset,
-            label_width=1,
-            preprocess_threads=data_nthreads,
-            batch_size=batch_size,
-            data_shape=data_shape,
-            label_name=label_name,
-            rand_crop=False,
-            rand_mirror=False,
-            shuffle=args.shuffle_dataset,
-            shuffle_chunk_seed=args.shuffle_chunk_seed,
-            seed=args.shuffle_seed,
-            dtype=data_layer_type,
-            ctx=args.ctx,
-            **combine_mean_std)
-
-        # loading model
-        sym, arg_params, aux_params = load_model(symbol_file, param_file, logger)
-
-        if args.low_precision:
-            sym, arg_params, aux_params = low_precison_convert(symbol_file,
-                                                               args.low_precision,
-                                                               sym, arg_params,
-                                                               aux_params)
-        # make sure that fp32 inference works on the same images as calibrated quantized model
-        logger.info('Skipping the first %d batches' % args.num_skipped_batches)
-        data = advance_data_iter(data, args.num_skipped_batches)
-
-        num_inference_images = args.num_inference_batches * batch_size
-        logger.info('Running model %s for inference' % symbol_file)
-        score(sym, arg_params, aux_params, data, [ctx], label_name,
-            max_num_examples=num_inference_images, logger=logger)
-    else:
-        logger.info('Running model %s for inference' % symbol_file)
-        speed = benchmark_score(symbol_file, ctx, batch_size,
-                                args.num_inference_batches, data_layer_type, args.low_precision, logger)
-        logger.info('batch size %2d, image/sec: %f', batch_size, speed)
diff --git a/example/quantization/launch_inference.sh b/example/quantization/launch_inference.sh
deleted file mode 100755
index 8c839ba0f611..000000000000
--- a/example/quantization/launch_inference.sh
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -ex
-
-python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-symbol.json --param-file=./model/imagenet1k-resnet-152-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-
-python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-
-python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-5batches-naive-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-10batches-naive-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-50batches-naive-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-
-python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-5batches-entropy-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-10batches-entropy-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-python imagenet_inference.py --symbol-file=./model/imagenet1k-resnet-152-quantized-50batches-entropy-symbol.json --param-file=./model/imagenet1k-resnet-152-quantized-0000.params --rgb-mean=0,0,0 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-
-
-python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-symbol.json --param-file=./model/imagenet1k-inception-bn-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-
-python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-
-python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-5batches-naive-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-10batches-naive-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-50batches-naive-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-
-python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-5batches-entropy-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-10batches-entropy-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
-python imagenet_inference.py --symbol-file=./model/imagenet1k-inception-bn-quantized-50batches-entropy-symbol.json --param-file=./model/imagenet1k-inception-bn-quantized-0000.params --rgb-mean=123.68,116.779,103.939 --num-skipped-batches=50 --num-inference-batches=500 --dataset=./data/val_256_q90.rec
diff --git a/example/quantization/launch_inference_mkldnn.sh b/example/quantization/launch_inference_mkldnn.sh
deleted file mode 100755
index 8491506fc8f6..000000000000
--- a/example/quantization/launch_inference_mkldnn.sh
+++ /dev/null
@@ -1,111 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-usage()
-{
-    echo "usage: bash ./launch_inference_mkldnn.sh [[[-s symbol_file ] [-b batch_size] [-iter iteraton] [-ins instance] [-c cores/instance]] | [-h]]"
-}
-
-while [ $# -gt 0 ]; do
-  case "$1" in
-    --symbol | -s)
-      shift
-      SYMBOL=$1
-      ;;
-    --batch-size | -b)
-      shift
-      BS=$1
-      ;;
-    --iteration | -iter)
-      shift
-      ITERATIONS=$1
-      ;;
-    --instance | -ins)
-      shift
-      INS=$1
-      ;;
-    --core | -c)
-      shift
-      CORES=$1
-      ;;
-    --help | -h)
-      usage
-      exit 1
-      ;;
-    *)
-      usage
-      exit 1
-  esac
-  shift
-done
-
-NUM_SOCKET=`lscpu | grep 'Socket(s)' | awk '{print $NF}'`
-NUM_NUMA_NODE=`lscpu | grep 'NUMA node(s)' | awk '{print $NF}'`
-CORES_PER_SOCKET=`lscpu | grep 'Core(s) per socket' | awk '{print $NF}'`
-NUM_CORES=$((CORES_PER_SOCKET * NUM_SOCKET))
-CORES_PER_NUMA=$((NUM_CORES / NUM_NUMA_NODE))
-echo "target machine has $NUM_CORES physical core(s) on $NUM_NUMA_NODE numa nodes of $NUM_SOCKET socket(s)."
-
-if [ -z $SYMBOL ]; then
-  echo "Error: Need a symbol file as input."
-fi
-if [ -z $INS ]; then
-  echo "Default: launch one instance per socket."
-  INS=$NUM_SOCKET
-fi
-if [ -z $CORES ]; then
-  echo "Default: divide full physical cores."
-  CORES=$((NUM_CORES / $INS))
-fi
-if [ -z $BS ]; then
-  echo "Default: set batch size to 64."
-  BS=64
-fi
-if [ -z $ITERATIONS ]; then
-  echo "Default: set iterations to 500."
-  ITERATIONS=500
-fi
-
-echo "  benchmark configs"
-echo "  cores per instance: $CORES"
-echo "  total instances: $INS"
-echo "  batch size: $BS"
-echo "  iterations: $ITERATIONS"
-echo ""
-
-rm BENCHMARK_*.log  || echo "benchmarking..."
-
-for((i=0;i<$INS;i++));
-do
-  ((a=$i*$CORES))
-  ((b=$a+$CORES-1))
-  memid=$((b/CORES_PER_NUMA % NUM_NUMA_NODE))
-  LOG=BENCHMARK_$i.log
-  echo "  Instance $i use $a-$b cores and mem $memid with $LOG"
-  KMP_AFFINITY=granularity=fine,noduplicates,compact,1,0 \
-  OMP_NUM_THREADS=$CORES \
-  nohup numactl --physcpubind=$a-$b --membind=$memid python imagenet_inference.py --symbol-file=$SYMBOL --batch-size=$BS --num-inference-batches=$ITERATIONS --ctx=cpu --benchmark=True > $LOG 2>&1 &
-done
-wait
-
-fps=`grep image/sec BENCHMARK_*.log | awk '{ sum += $(NF) }; END { print sum }'`
-latency=$(awk "BEGIN {printf \"%.2f\", 1000*${BS}*${INS}/${fps}}")
-echo "overall throughput (image/sec): $fps"
-echo "latency per batch per instance (ms): $latency"
-echo "benchmark finish:)"
diff --git a/example/quantization/launch_quantize.sh b/example/quantization/launch_quantize.sh
deleted file mode 100755
index 9aa4bee4bff1..000000000000
--- a/example/quantization/launch_quantize.sh
+++ /dev/null
@@ -1,41 +0,0 @@
-#!/bin/sh
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -ex
-
-python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-mode=none
-
-python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-dataset=./data/val_256_q90.rec --num-calib-batches=5 --calib-mode=naive
-python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-dataset=./data/val_256_q90.rec --num-calib-batches=10 --calib-mode=naive
-python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-dataset=./data/val_256_q90.rec --num-calib-batches=50 --calib-mode=naive
-
-python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-dataset=./data/val_256_q90.rec --num-calib-batches=5 --calib-mode=entropy
-python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-dataset=./data/val_256_q90.rec --num-calib-batches=10 --calib-mode=entropy
-python imagenet_gen_qsym.py --model=imagenet1k-resnet-152 --calib-dataset=./data/val_256_q90.rec --num-calib-batches=50 --calib-mode=entropy
-
-
-python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-mode=none
-
-python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-dataset=./data/val_256_q90.rec --num-calib-batches=5 --calib-mode=naive
-python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-dataset=./data/val_256_q90.rec --num-calib-batches=10 --calib-mode=naive
-python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-dataset=./data/val_256_q90.rec --num-calib-batches=50 --calib-mode=naive
-
-python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-dataset=./data/val_256_q90.rec --num-calib-batches=5 --calib-mode=entropy
-python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-dataset=./data/val_256_q90.rec --num-calib-batches=10 --calib-mode=entropy
-python imagenet_gen_qsym.py --model=imagenet1k-inception-bn --calib-dataset=./data/val_256_q90.rec --num-calib-batches=50 --calib-mode=entropy
diff --git a/example/recommenders/demo-MF.R b/example/recommenders/demo-MF.R
deleted file mode 100644
index 82c0aae9c62c..000000000000
--- a/example/recommenders/demo-MF.R
+++ /dev/null
@@ -1,84 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-library(mxnet)
-DF <- read.table("./ml-100k/u.data", header = F, sep = "\t")
-names(DF) <- c("user", "item", "score", "time")
-max_user <- max(DF$user)
-max_item <- max(DF$item)
-DF_mat_x <- data.matrix(t(DF[, 1:2]))
-DF_y <- DF[, 3]
-k <- 64
-user <- mx.symbol.Variable("user")
-item <- mx.symbol.Variable("item")
-score <- mx.symbol.Variable("label")
-user1 <-mx.symbol.Embedding(data = mx.symbol.BlockGrad(user), input_dim = max_user,
-                            output_dim = k, name = "user1")
-item1 <- mx.symbol.Embedding(data = mx.symbol.BlockGrad(item), input_dim = max_item,
-                             output_dim = k, name = "item1")
-pred <- user1 * item1
-pred1 <- mx.symbol.sum_axis(pred, axis = 1, name = "pred1")
-pred2 <- mx.symbol.Flatten(pred1, name = "pred2")
-pred3 <- mx.symbol.LinearRegressionOutput(data = pred2, label = score, name = "pred3")
-devices <- mx.cpu()
-mx.set.seed(123)
-
-CustomIter <- setRefClass("CustomIter", fields = c("iter1", "iter2"),
-  contains = "Rcpp_MXArrayDataIter",
-  methods = list(
-    initialize = function(iter1, iter2) {
-      .self$iter1 <- iter1
-      .self$iter2 <- iter2
-      .self
-    },
-    value = function() {
-      user <- .self$iter1$value()$data
-      item <- .self$iter2$value()$data
-      label <- .self$iter1$value()$label
-      list(user = user,
-           item = item,
-           label = label)
-    },
-    iter.next = function() {
-      .self$iter1$iter.next()
-      .self$iter2$iter.next()
-    },
-    reset = function() {
-      .self$iter1$reset()
-      .self$iter2$reset()
-    },
-    num.pad = function() {
-      .self$iter1$num.pad()
-    },
-    finalize = function() {
-      .self$iter1$finalize()
-      .self$iter2$finalize()
-    }
-  )
-)
-
-user_iter = mx.io.arrayiter(data = DF[, 1], label = DF[, 3], batch.size = k)
-
-item_iter = mx.io.arrayiter(data = DF[, 2], label = DF[, 3], batch.size = k)
-
-train_iter <- CustomIter$new(user_iter, item_iter)
-
-model <- mx.model.FeedForward.create(pred3, X = train_iter, ctx = devices,
-                                     num.round = 10, initializer = mx.init.uniform(0.07),
-                                     learning.rate = 0.07, eval.metric = mx.metric.rmse,
-                                     momentum = 0.9, epoch.end.callback = mx.callback.log.train.metric(1),
-                                     input.names = c("user", "item"), output.names = "label")
diff --git a/julia/README.md b/julia/README.md
index 43b8deb0666a..405f662d034c 100644
--- a/julia/README.md
+++ b/julia/README.md
@@ -25,53 +25,5 @@ MXNet.jl is the [Apache MXNet](https://github.com/apache/incubator-mxnet) [Julia
 * Efficient tensor/matrix computation across multiple devices, including multiple CPUs, GPUs and distributed server nodes.
 * Flexible symbolic manipulation to composite and construction of state-of-the-art deep learning models.
 
-Here is an example of how training a simple 3-layer MLP on MNIST:
-
-```julia
-using MXNet
-
-mlp = @mx.chain mx.Variable(:data)             =>
-  mx.FullyConnected(name=:fc1, num_hidden=128) =>
-  mx.Activation(name=:relu1, act_type=:relu)   =>
-  mx.FullyConnected(name=:fc2, num_hidden=64)  =>
-  mx.Activation(name=:relu2, act_type=:relu)   =>
-  mx.FullyConnected(name=:fc3, num_hidden=10)  =>
-  mx.SoftmaxOutput(name=:softmax)
-
-# data provider
-batch_size = 100
-include(Pkg.dir("MXNet", "examples", "mnist", "mnist-data.jl"))
-train_provider, eval_provider = get_mnist_providers(batch_size)
-
-# setup model
-model = mx.FeedForward(mlp, context=mx.cpu())
-
-# optimization algorithm
-# where η is learning rate and μ is momentum
-optimizer = mx.SGD(η=0.1, μ=0.9)
-
-# fit parameters
-mx.fit(model, optimizer, train_provider, n_epoch=20, eval_data=eval_provider)
-```
-
-You can also predict using the `model` in the following way:
-
-```julia
-probs = mx.predict(model, eval_provider)
-
-# collect all labels from eval data
-labels = reduce(
-  vcat,
-  copy(mx.get(eval_provider, batch, :softmax_label)) for batch ∈ eval_provider)
-# labels are 0...9
-labels .= labels .+ 1
-
-# Now we compute the accuracy
-pred = map(i -> argmax(probs[1:10, i]), 1:size(probs, 2))
-correct = sum(pred .== labels)
-accuracy = 100correct/length(labels)
-@printf "Accuracy on eval set: %.2f%%\n" accuracy
-```
-
 For more details, please refer to the
 [documentation](https://dmlc.github.io/MXNet.jl/latest) and [examples](examples).
diff --git a/julia/docs/mkdocs.yml b/julia/docs/mkdocs.yml
index 880fad24d5b8..6ab34726e7ef 100644
--- a/julia/docs/mkdocs.yml
+++ b/julia/docs/mkdocs.yml
@@ -43,9 +43,6 @@ docs_dir: 'build'
 
 nav:
   - Home: index.md
-  - Tutorial:
-    - Digit Recognition on MNIST: tutorial/mnist.md
-    - Generating Random Sentence with LSTM RNN: tutorial/char-lstm.md
   - User Guide:
     - Installation Guide: user-guide/install.md
     - Overview: user-guide/overview.md
diff --git a/julia/docs/src/index.md b/julia/docs/src/index.md
index 4213265b4bd4..e21bc165457a 100644
--- a/julia/docs/src/index.md
+++ b/julia/docs/src/index.md
@@ -31,16 +31,6 @@ include:
 For more details, see documentation below. Please also checkout the
 [examples](https://github.com/apache/incubator-mxnet/tree/master/julia/examples) directory.
 
-## Tutorials
-
-```@contents
-Pages = [
-  "tutorial/mnist.md",
-  "tutorial/char-lstm.md",
-]
-Depth = 2
-```
-
 ## User's Guide
 
 ```@contents
diff --git a/julia/docs/src/tutorial/char-lstm.md b/julia/docs/src/tutorial/char-lstm.md
deleted file mode 100644
index c7dc9d6c07db..000000000000
--- a/julia/docs/src/tutorial/char-lstm.md
+++ /dev/null
@@ -1,323 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-Generating Random Sentence with LSTM RNN
-========================================
-
-This tutorial shows how to train a LSTM (Long short-term memory) RNN
-(recurrent neural network) to perform character-level sequence training
-and prediction. The original model, usually called `char-rnn` is
-described in [Andrej Karpathy's
-blog](http://karpathy.github.io/2015/05/21/rnn-effectiveness/), with a
-reference implementation in Torch available
-[here](https://github.com/karpathy/char-rnn).
-
-Because MXNet.jl does not have a specialized model for recurrent neural
-networks yet, the example shown here is an implementation of LSTM by
-using the default FeedForward model via explicitly unfolding over time.
-We will be using fixed-length input sequence for training. The code is
-adapted from the [char-rnn example for MXNet's Python
-binding](https://github.com/apache/incubator-mxnet/blob/8004a027ad6a73f8f6eae102de8d249fbdfb9a2d/example/rnn/old/char-rnn.ipynb),
-which demonstrates how to use low-level
-[Symbolic API](@ref) to build customized neural
-network models directly.
-
-The most important code snippets of this example are shown and explained
-here. To see and run the complete code, please refer to the
-[examples/char-lstm](https://github.com/apache/incubator-mxnet/tree/master/julia/examples/char-lstm)
-directory. You will need to install
-[Iterators.jl](https://github.com/JuliaLang/Iterators.jl) and
-[StatsBase.jl](https://github.com/JuliaStats/StatsBase.jl) to run this
-example.
-
-LSTM Cells
-----------
-
-Christopher Olah has a [great blog post about LSTM](http://colah.github.io/posts/2015-08-Understanding-LSTMs/) with
-beautiful and clear illustrations. So we will not repeat the definition
-and explanation of what an LSTM cell is here. Basically, an LSTM cell
-takes input `x`, as well as previous states (including `c` and `h`), and
-produces the next states. We define a helper type to bundle the two state
-variables together:
-
-Because LSTM weights are shared across all time steps when we do explicit
-unfolding, we also define a helper type to hold all the weights (and
-bias) for an LSTM cell for convenience.
-
-Note all the variables are of type SymbolicNode. We will construct the
-LSTM network as a symbolic computation graph, which is then instantiated
-with NDArray for actual computation.
-
-The following figure is stolen (permission requested) from [Christopher
-Olah's blog](http://colah.github.io/posts/2015-08-Understanding-LSTMs/),
-which illustrates exactly what the code snippet above is doing.
-
-![image](images/LSTM3-chain.png)
-
-In particular, instead of defining the four gates independently, we do
-the computation together and then use SliceChannel to split them into
-four outputs. The computation of the gates is all done with the symbolic
-API. The return value is a LSTM state containing the output of a LSTM
-cell.
-
-Unfolding LSTM
---------------
-
-Using the LSTM cell defined above, we are now ready to define a function
-to unfold a LSTM network with L layers and T time steps. The first part
-of the function is just defining all the symbolic variables for the
-shared weights and states.
-
-The `embed_W` is the weights used for character embedding --- i.e.
-mapping the one-hot encoded characters into real vectors. The `pred_W`
-and `pred_b` are weights and bias for the final prediction at each time
-step.
-
-Then we define the weights for each LSTM cell. Note there is one cell
-for each layer, and it will be replicated (unrolled) over time. The
-states are, however, *not* shared over time. Instead, we define the
-initial states here at the beginning of a sequence, and we will update
-them with the output states at each time step as we explicitly unroll
-the LSTM.
-
-Unrolling over time is a straightforward procedure of stacking the
-embedding layer, and then LSTM cells, on top of which the prediction
-layer. During unrolling, we update the states and collect all the
-outputs. Note each time step takes data and label as inputs. If the LSTM
-is named as `:ptb`, the data and label at step `t` will be named
-`:ptb_data_$t` and `:ptb_label_$t`. Later on when we prepare the data, we
-will define the data provider to match those names.
-
-Note at each time step, the prediction is connected to a SoftmaxOutput
-operator, which could back propagate when corresponding labels are
-provided. The states are then connected to the next time step, which
-allows back propagation through time. However, at the end of the sequence,
-the final states are not connected to anything. These dangling outputs are
-problematic, so we explicitly connect each of them to a BlockGrad
-operator, which simply back propagates 0-gradient and closes the
-computation graph.
-
-In the end, we just group all the prediction outputs at each time step
-as a single SymbolicNode and return. Optionally we will also group the
-final states, this is used when we use the trained LSTM to sample
-sentences.
-
-Data Provider for Text Sequences
---------------------------------
-
-Now we need to construct a data provider that takes a text file, divides
-the text into mini-batches of fixed-length character-sequences, and
-provides them as one-hot encoded vectors.
-
-Note there is no fancy feature extraction at all. Each character is simply
-encoded as a one-hot vector: a 0-1 vector of the size given by the
-vocabulary. Here we just construct the vocabulary by collecting all the
-unique characters in the training text -- there are not too many of them
-(including punctuations and whitespace) for English text. Each input
-character is then encoded as a vector of 0s on all coordinates, and 1 on
-the coordinate corresponding to that character. The
-character-to-coordinate mapping is given by the vocabulary.
-
-The text sequence data provider implements the [Data Providers](@ref) api. We define the `CharSeqProvider` as below:
-
-The provided data and labels follow the naming convention of inputs used
-when unrolling the LSTM. Note in the code below, apart from
-`$name_data_$t` and `$name_label_$t`, we also provide the initial `c`
-and `h` states for each layer. This is because we are using the
-high-level FeedForward API, which has no idea about time and states. So
-we will feed the initial states for each sequence from the data
-provider. Since the initial states are always zero, we just need to
-always provide constant zero blobs.
-
-Next we implement the `eachbatch` method from the [`mx.AbstractDataProvider`](@ref) interface for the
-provider. We start by defining the data and label arrays, and the
-`DataBatch` object we will provide in each iteration.
-
-The actual data providing iteration is implemented as a Julia
-**coroutine**. In this way, we can write the data loading logic as a
-simple coherent `for` loop, and do not need to implement the interface
-functions like Base.start, Base.next, etc.
-
-Basically, we partition the text into batches, each batch containing
-several contiguous text sequences. Note at each time step, the LSTM is
-trained to predict the next character, so the label is the same as the
-data, but shifted ahead by one index.
-
-Training the LSTM
------------------
-
-Now we have implemented all the supporting infrastructures for our
-char-lstm. To train the model, we just follow the standard high-level
-API. Firstly, we construct a LSTM symbolic architecture:
-
-Note all the parameters are defined in
-[examples/char-lstm/config.jl](https://github.com/apache/incubator-mxnet/blob/master/julia/examples/char-lstm/config.jl).
-Now we load the text file and define the data provider. The data
-`input.txt` we used in this example is [a tiny Shakespeare
-dataset](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare).
-But you can try with other text files.
-
-The last step is to construct a model, an optimizer and fit the model to
-the data. We are using the ADAM optimizer in this example.
-
-Note we are also using a customized `NLL` evaluation metric, which
-calculates the negative log-likelihood during training. Here is an output
-sample at the end of the training process.
-
-```
-...
-INFO: Speed: 357.72 samples/sec
-INFO: == Epoch 020 ==========
-INFO: ## Training summary
-INFO:                NLL = 1.4672
-INFO:         perplexity = 4.3373
-INFO:               time = 87.2631 seconds
-INFO: ## Validation summary
-INFO:                NLL = 1.6374
-INFO:         perplexity = 5.1418
-INFO: Saved checkpoint to 'char-lstm/checkpoints/ptb-0020.params'
-INFO: Speed: 368.74 samples/sec
-INFO: Speed: 361.04 samples/sec
-INFO: Speed: 360.02 samples/sec
-INFO: Speed: 362.34 samples/sec
-INFO: Speed: 360.80 samples/sec
-INFO: Speed: 362.77 samples/sec
-INFO: Speed: 357.18 samples/sec
-INFO: Speed: 355.30 samples/sec
-INFO: Speed: 362.33 samples/sec
-INFO: Speed: 359.23 samples/sec
-INFO: Speed: 358.09 samples/sec
-INFO: Speed: 356.89 samples/sec
-INFO: Speed: 371.91 samples/sec
-INFO: Speed: 372.24 samples/sec
-INFO: Speed: 356.59 samples/sec
-INFO: Speed: 356.64 samples/sec
-INFO: Speed: 360.24 samples/sec
-INFO: Speed: 360.32 samples/sec
-INFO: Speed: 362.38 samples/sec
-INFO: == Epoch 021 ==========
-INFO: ## Training summary
-INFO:                NLL = 1.4655
-INFO:         perplexity = 4.3297
-INFO:               time = 86.9243 seconds
-INFO: ## Validation summary
-INFO:                NLL = 1.6366
-INFO:         perplexity = 5.1378
-INFO: Saved checkpoint to 'examples/char-lstm/checkpoints/ptb-0021.params'
-```
-
-Sampling Random Sentences
--------------------------
-
-After training the LSTM, we can now sample random sentences from the
-trained model. The sampler works in the following way:
-
--   Starting from some fixed character, take `a` for example, and feed
-    it as input to the LSTM.
--   The LSTM will produce an output distribution over the vocabulary and
-    a state in the first time step. We sample a character from the
-    output distribution, fix it as the second character.
--   In the next time step, we feed the previously sampled character as
-    input and continue running the LSTM by also taking the previous
-    states (instead of the 0 initial states).
--   Continue running until we sampled enough characters.
-
-Note we are running with mini-batches, so several sentences could be
-sampled simultaneously. Here are some sampled outputs from a network I
-trained for around half an hour on the Shakespeare dataset. Note all the
-line-breaks, punctuations and upper-lower case letters are produced by
-the sampler itself. I did not do any post-processing.
-
-```
-## Sample 1
-all have sir,
-Away will fill'd in His time, I'll keep her, do not madam, if they here? Some more ha?
-
-## Sample 2
-am.
-
-CLAUDIO:
-Hone here, let her, the remedge, and I know not slept a likely, thou some soully free?
-
-## Sample 3
-arrel which noble thing
-The exchnachsureding worns: I ne'er drunken Biancas, fairer, than the lawfu?
-
-## Sample 4
-augh assalu, you'ld tell me corn;
-Farew. First, for me of a loved. Has thereat I knock you presents?
-
-## Sample 5
-ame the first answer.
-
-MARIZARINIO:
-Door of Angelo as her lord, shrield liken Here fellow the fool ?
-
-## Sample 6
-ad well.
-
-CLAUDIO:
-Soon him a fellows here; for her fine edge in a bogms' lord's wife.
-
-LUCENTIO:
-I?
-
-## Sample 7
-adrezilian measure.
-
-LUCENTIO:
-So, help'd you hath nes have a than dream's corn, beautio, I perchas?
-
-## Sample 8
-as eatter me;
-The girlly: and no other conciolation!
-
-BISTRUMIO:
-I have be rest girl. O, that I a h?
-
-## Sample 9
-and is intend you sort:
-What held her all 'clama's for maffice. Some servant.' what I say me the cu?
-
-## Sample 10
-an thoughts will said in our pleasue,
-Not scanin on him that you live; believaries she.
-
-ISABELLLLL?
-```
-
-See [Andrej Karpathy's blog
-post](http://karpathy.github.io/2015/05/21/rnn-effectiveness/) on more
-examples and links including Linux source codes, Algebraic Geometry
-Theorems, and even cooking recipes. The code for sampling can be found
-in
-[examples/char-lstm/sampler.jl](https://github.com/apache/incubator-mxnet/tree/master/julia/examples/char-lstm/sampler.jl).
-
-Visualizing the LSTM
---------------------
-
-Finally, you could visualize the LSTM by calling to\_graphviz on the
-constructed LSTM symbolic architecture. We only show an example of
-1-layer and 2-time-step LSTM below. The automatic layout produced by
-GraphViz is definitely much less clear than [Christopher Olah's
-illustrations](http://colah.github.io/posts/2015-08-Understanding-LSTMs/),
-but could otherwise be very useful for debugging. As we can see, the
-LSTM unfolded over time is just a (very) deep neural network. The
-complete code for producing this visualization can be found in
-[examples/char-lstm/visualize.jl](https://github.com/apache/incubator-mxnet/blob/master/julia/examples/char-lstm/visualize.jl).
-
-![image](images/char-lstm-vis.svg)
diff --git a/julia/docs/src/tutorial/images/LSTM3-chain.png b/julia/docs/src/tutorial/images/LSTM3-chain.png
deleted file mode 100644
index e962a3c72078..000000000000
Binary files a/julia/docs/src/tutorial/images/LSTM3-chain.png and /dev/null differ
diff --git a/julia/docs/src/tutorial/images/char-lstm-vis.svg b/julia/docs/src/tutorial/images/char-lstm-vis.svg
deleted file mode 100644
index 610abab774b7..000000000000
--- a/julia/docs/src/tutorial/images/char-lstm-vis.svg
+++ /dev/null
@@ -1,435 +0,0 @@
-<?xml version="1.0" encoding="UTF-8" standalone="no"?>
-<!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
- "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
-<!-- Generated by graphviz version 2.38.0 (20140413.2041)
- -->
-<!-- Title: Network Visualization Pages: 1 -->
-<svg width="606pt" height="1758pt"
- viewBox="0.00 0.00 606.00 1758.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
-<g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 1754)">
-<title>Network Visualization</title>
-<polygon fill="white" stroke="none" points="-4,4 -4,-1754 602,-1754 602,4 -4,4"/>
-<!-- ptb_embed_2 -->
-<g id="node1" class="node"><title>ptb_embed_2</title>
-<path fill="#fb8072" stroke="#941305" stroke-width="2" d="M269,-810C269,-810 199,-810 199,-810 193,-810 187,-804 187,-798 187,-798 187,-764 187,-764 187,-758 193,-752 199,-752 199,-752 269,-752 269,-752 275,-752 281,-758 281,-764 281,-764 281,-798 281,-798 281,-804 275,-810 269,-810"/>
-<text text-anchor="middle" x="234" y="-788" font-family="Times,serif" font-size="10.00">ptb_embed_2</text>
-<text text-anchor="middle" x="234" y="-778" font-family="Times,serif" font-size="10.00">FullyConnected</text>
-<text text-anchor="middle" x="234" y="-768" font-family="Times,serif" font-size="10.00">num&#45;hidden=256</text>
-</g>
-<!-- ptb_lstm_2_i2h -->
-<g id="node2" class="node"><title>ptb_lstm_2_i2h</title>
-<path fill="#fb8072" stroke="#941305" stroke-width="2" d="M269,-904C269,-904 199,-904 199,-904 193,-904 187,-898 187,-892 187,-892 187,-858 187,-858 187,-852 193,-846 199,-846 199,-846 269,-846 269,-846 275,-846 281,-852 281,-858 281,-858 281,-892 281,-892 281,-898 275,-904 269,-904"/>
-<text text-anchor="middle" x="234" y="-882" font-family="Times,serif" font-size="10.00">ptb_lstm_2_i2h</text>
-<text text-anchor="middle" x="234" y="-872" font-family="Times,serif" font-size="10.00">FullyConnected</text>
-<text text-anchor="middle" x="234" y="-862" font-family="Times,serif" font-size="10.00">num&#45;hidden=1024</text>
-</g>
-<!-- ptb_lstm_2_i2h&#45;&gt;ptb_embed_2 -->
-<g id="edge1" class="edge"><title>ptb_lstm_2_i2h&#45;&gt;ptb_embed_2</title>
-<path fill="none" stroke="#737373" d="M234,-835.744C234,-827.204 234,-818.298 234,-810.248"/>
-<polygon fill="#737373" stroke="#737373" points="234,-845.897 229.5,-835.897 234,-840.897 234,-835.897 234,-835.897 234,-835.897 234,-840.897 238.5,-835.897 234,-845.897 234,-845.897"/>
-</g>
-<!-- ptb_embed_1 -->
-<g id="node3" class="node"><title>ptb_embed_1</title>
-<path fill="#fb8072" stroke="#941305" stroke-width="2" d="M194,-58C194,-58 124,-58 124,-58 118,-58 112,-52 112,-46 112,-46 112,-12 112,-12 112,-6 118,-0 124,-0 124,-0 194,-0 194,-0 200,-0 206,-6 206,-12 206,-12 206,-46 206,-46 206,-52 200,-58 194,-58"/>
-<text text-anchor="middle" x="159" y="-36" font-family="Times,serif" font-size="10.00">ptb_embed_1</text>
-<text text-anchor="middle" x="159" y="-26" font-family="Times,serif" font-size="10.00">FullyConnected</text>
-<text text-anchor="middle" x="159" y="-16" font-family="Times,serif" font-size="10.00">num&#45;hidden=256</text>
-</g>
-<!-- ptb_lstm_1_i2h -->
-<g id="node4" class="node"><title>ptb_lstm_1_i2h</title>
-<path fill="#fb8072" stroke="#941305" stroke-width="2" d="M194,-152C194,-152 124,-152 124,-152 118,-152 112,-146 112,-140 112,-140 112,-106 112,-106 112,-100 118,-94 124,-94 124,-94 194,-94 194,-94 200,-94 206,-100 206,-106 206,-106 206,-140 206,-140 206,-146 200,-152 194,-152"/>
-<text text-anchor="middle" x="159" y="-130" font-family="Times,serif" font-size="10.00">ptb_lstm_1_i2h</text>
-<text text-anchor="middle" x="159" y="-120" font-family="Times,serif" font-size="10.00">FullyConnected</text>
-<text text-anchor="middle" x="159" y="-110" font-family="Times,serif" font-size="10.00">num&#45;hidden=1024</text>
-</g>
-<!-- ptb_lstm_1_i2h&#45;&gt;ptb_embed_1 -->
-<g id="edge2" class="edge"><title>ptb_lstm_1_i2h&#45;&gt;ptb_embed_1</title>
-<path fill="none" stroke="#737373" d="M159,-83.7443C159,-75.2043 159,-66.2977 159,-58.2479"/>
-<polygon fill="#737373" stroke="#737373" points="159,-93.8971 154.5,-83.897 159,-88.8971 159,-83.8971 159,-83.8971 159,-83.8971 159,-88.8971 163.5,-83.8971 159,-93.8971 159,-93.8971"/>
-</g>
-<!-- ptb_lstm_1_h2h -->
-<g id="node5" class="node"><title>ptb_lstm_1_h2h</title>
-<path fill="#fb8072" stroke="#941305" stroke-width="2" d="M306,-152C306,-152 236,-152 236,-152 230,-152 224,-146 224,-140 224,-140 224,-106 224,-106 224,-100 230,-94 236,-94 236,-94 306,-94 306,-94 312,-94 318,-100 318,-106 318,-106 318,-140 318,-140 318,-146 312,-152 306,-152"/>
-<text text-anchor="middle" x="271" y="-130" font-family="Times,serif" font-size="10.00">ptb_lstm_1_h2h</text>
-<text text-anchor="middle" x="271" y="-120" font-family="Times,serif" font-size="10.00">FullyConnected</text>
-<text text-anchor="middle" x="271" y="-110" font-family="Times,serif" font-size="10.00">num&#45;hidden=1024</text>
-</g>
-<!-- _plus0 -->
-<g id="node6" class="node"><title>_plus0</title>
-<path fill="#fccde5" stroke="#90094e" stroke-width="2" d="M250,-246C250,-246 180,-246 180,-246 174,-246 168,-240 168,-234 168,-234 168,-200 168,-200 168,-194 174,-188 180,-188 180,-188 250,-188 250,-188 256,-188 262,-194 262,-200 262,-200 262,-234 262,-234 262,-240 256,-246 250,-246"/>
-<text text-anchor="middle" x="215" y="-219" font-family="Times,serif" font-size="10.00">_plus0</text>
-<text text-anchor="middle" x="215" y="-209" font-family="Times,serif" font-size="10.00">_Plus</text>
-</g>
-<!-- _plus0&#45;&gt;ptb_lstm_1_i2h -->
-<g id="edge3" class="edge"><title>_plus0&#45;&gt;ptb_lstm_1_i2h</title>
-<path fill="none" stroke="#737373" d="M192.569,-179.148C187.113,-170.186 181.363,-160.74 176.194,-152.248"/>
-<polygon fill="#737373" stroke="#737373" points="197.894,-187.897 188.85,-181.695 195.294,-183.626 192.694,-179.355 192.694,-179.355 192.694,-179.355 195.294,-183.626 196.538,-177.015 197.894,-187.897 197.894,-187.897"/>
-</g>
-<!-- _plus0&#45;&gt;ptb_lstm_1_h2h -->
-<g id="edge4" class="edge"><title>_plus0&#45;&gt;ptb_lstm_1_h2h</title>
-<path fill="none" stroke="#737373" d="M237.431,-179.148C242.887,-170.186 248.637,-160.74 253.806,-152.248"/>
-<polygon fill="#737373" stroke="#737373" points="232.106,-187.897 233.462,-177.015 234.706,-183.626 237.306,-179.355 237.306,-179.355 237.306,-179.355 234.706,-183.626 241.15,-181.695 232.106,-187.897 232.106,-187.897"/>
-</g>
-<!-- ptb_lstm_1_gates -->
-<g id="node7" class="node"><title>ptb_lstm_1_gates</title>
-<path fill="#fccde5" stroke="#90094e" stroke-width="2" d="M250,-340C250,-340 180,-340 180,-340 174,-340 168,-334 168,-328 168,-328 168,-294 168,-294 168,-288 174,-282 180,-282 180,-282 250,-282 250,-282 256,-282 262,-288 262,-294 262,-294 262,-328 262,-328 262,-334 256,-340 250,-340"/>
-<text text-anchor="middle" x="215" y="-313" font-family="Times,serif" font-size="10.00">ptb_lstm_1_gates</text>
-<text text-anchor="middle" x="215" y="-303" font-family="Times,serif" font-size="10.00">SliceChannel</text>
-</g>
-<!-- ptb_lstm_1_gates&#45;&gt;_plus0 -->
-<g id="edge5" class="edge"><title>ptb_lstm_1_gates&#45;&gt;_plus0</title>
-<path fill="none" stroke="#737373" d="M215,-271.744C215,-263.204 215,-254.298 215,-246.248"/>
-<polygon fill="#737373" stroke="#737373" points="215,-281.897 210.5,-271.897 215,-276.897 215,-271.897 215,-271.897 215,-271.897 215,-276.897 219.5,-271.897 215,-281.897 215,-281.897"/>
-</g>
-<!-- activation3 -->
-<g id="node8" class="node"><title>activation3</title>
-<path fill="#ffffb3" stroke="#999900" stroke-width="2" d="M381,-622C381,-622 311,-622 311,-622 305,-622 299,-616 299,-610 299,-610 299,-576 299,-576 299,-570 305,-564 311,-564 311,-564 381,-564 381,-564 387,-564 393,-570 393,-576 393,-576 393,-610 393,-610 393,-616 387,-622 381,-622"/>
-<text text-anchor="middle" x="346" y="-600" font-family="Times,serif" font-size="10.00">activation3</text>
-<text text-anchor="middle" x="346" y="-590" font-family="Times,serif" font-size="10.00">Activation</text>
-<text text-anchor="middle" x="346" y="-580" font-family="Times,serif" font-size="10.00">act&#45;type=sigmoid</text>
-</g>
-<!-- activation3&#45;&gt;ptb_lstm_1_gates -->
-<g id="edge6" class="edge"><title>activation3&#45;&gt;ptb_lstm_1_gates</title>
-<path fill="none" stroke="#737373" d="M352.67,-553.687C358.562,-507.677 361.566,-429.965 327,-376 312.142,-352.804 285.707,-337.124 262.249,-327.092"/>
-<polygon fill="#737373" stroke="#737373" points="351.256,-563.836 348.179,-553.311 351.946,-558.884 352.636,-553.932 352.636,-553.932 352.636,-553.932 351.946,-558.884 357.093,-554.552 351.256,-563.836 351.256,-563.836"/>
-</g>
-<!-- activation2 -->
-<g id="node9" class="node"><title>activation2</title>
-<path fill="#ffffb3" stroke="#999900" stroke-width="2" d="M82,-434C82,-434 12,-434 12,-434 6,-434 -7.10543e-15,-428 -7.10543e-15,-422 -7.10543e-15,-422 -7.10543e-15,-388 -7.10543e-15,-388 -7.10543e-15,-382 6,-376 12,-376 12,-376 82,-376 82,-376 88,-376 94,-382 94,-388 94,-388 94,-422 94,-422 94,-428 88,-434 82,-434"/>
-<text text-anchor="middle" x="47" y="-412" font-family="Times,serif" font-size="10.00">activation2</text>
-<text text-anchor="middle" x="47" y="-402" font-family="Times,serif" font-size="10.00">Activation</text>
-<text text-anchor="middle" x="47" y="-392" font-family="Times,serif" font-size="10.00">act&#45;type=sigmoid</text>
-</g>
-<!-- activation2&#45;&gt;ptb_lstm_1_gates -->
-<g id="edge7" class="edge"><title>activation2&#45;&gt;ptb_lstm_1_gates</title>
-<path fill="none" stroke="#737373" d="M103.067,-373.297C124.069,-361.795 147.701,-348.854 167.819,-337.837"/>
-<polygon fill="#737373" stroke="#737373" points="94.2444,-378.128 100.854,-369.378 98.6299,-375.726 103.015,-373.325 103.015,-373.325 103.015,-373.325 98.6299,-375.726 105.177,-377.272 94.2444,-378.128 94.2444,-378.128"/>
-</g>
-<!-- _mul0 -->
-<g id="node10" class="node"><title>_mul0</title>
-<path fill="#fccde5" stroke="#90094e" stroke-width="2" d="M82,-528C82,-528 12,-528 12,-528 6,-528 -7.10543e-15,-522 -7.10543e-15,-516 -7.10543e-15,-516 -7.10543e-15,-482 -7.10543e-15,-482 -7.10543e-15,-476 6,-470 12,-470 12,-470 82,-470 82,-470 88,-470 94,-476 94,-482 94,-482 94,-516 94,-516 94,-522 88,-528 82,-528"/>
-<text text-anchor="middle" x="47" y="-501" font-family="Times,serif" font-size="10.00">_mul0</text>
-<text text-anchor="middle" x="47" y="-491" font-family="Times,serif" font-size="10.00">_Mul</text>
-</g>
-<!-- _mul0&#45;&gt;activation2 -->
-<g id="edge8" class="edge"><title>_mul0&#45;&gt;activation2</title>
-<path fill="none" stroke="#737373" d="M47,-459.744C47,-451.204 47,-442.298 47,-434.248"/>
-<polygon fill="#737373" stroke="#737373" points="47,-469.897 42.5001,-459.897 47,-464.897 47.0001,-459.897 47.0001,-459.897 47.0001,-459.897 47,-464.897 51.5001,-459.897 47,-469.897 47,-469.897"/>
-</g>
-<!-- activation0 -->
-<g id="node11" class="node"><title>activation0</title>
-<path fill="#ffffb3" stroke="#999900" stroke-width="2" d="M194,-434C194,-434 124,-434 124,-434 118,-434 112,-428 112,-422 112,-422 112,-388 112,-388 112,-382 118,-376 124,-376 124,-376 194,-376 194,-376 200,-376 206,-382 206,-388 206,-388 206,-422 206,-422 206,-428 200,-434 194,-434"/>
-<text text-anchor="middle" x="159" y="-412" font-family="Times,serif" font-size="10.00">activation0</text>
-<text text-anchor="middle" x="159" y="-402" font-family="Times,serif" font-size="10.00">Activation</text>
-<text text-anchor="middle" x="159" y="-392" font-family="Times,serif" font-size="10.00">act&#45;type=sigmoid</text>
-</g>
-<!-- activation0&#45;&gt;ptb_lstm_1_gates -->
-<g id="edge9" class="edge"><title>activation0&#45;&gt;ptb_lstm_1_gates</title>
-<path fill="none" stroke="#737373" d="M181.431,-367.148C186.887,-358.186 192.637,-348.74 197.806,-340.248"/>
-<polygon fill="#737373" stroke="#737373" points="176.106,-375.897 177.462,-365.015 178.706,-371.626 181.306,-367.355 181.306,-367.355 181.306,-367.355 178.706,-371.626 185.15,-369.695 176.106,-375.897 176.106,-375.897"/>
-</g>
-<!-- activation1 -->
-<g id="node12" class="node"><title>activation1</title>
-<path fill="#ffffb3" stroke="#999900" stroke-width="2" d="M306,-434C306,-434 236,-434 236,-434 230,-434 224,-428 224,-422 224,-422 224,-388 224,-388 224,-382 230,-376 236,-376 236,-376 306,-376 306,-376 312,-376 318,-382 318,-388 318,-388 318,-422 318,-422 318,-428 312,-434 306,-434"/>
-<text text-anchor="middle" x="271" y="-412" font-family="Times,serif" font-size="10.00">activation1</text>
-<text text-anchor="middle" x="271" y="-402" font-family="Times,serif" font-size="10.00">Activation</text>
-<text text-anchor="middle" x="271" y="-392" font-family="Times,serif" font-size="10.00">act&#45;type=tanh</text>
-</g>
-<!-- activation1&#45;&gt;ptb_lstm_1_gates -->
-<g id="edge10" class="edge"><title>activation1&#45;&gt;ptb_lstm_1_gates</title>
-<path fill="none" stroke="#737373" d="M248.569,-367.148C243.113,-358.186 237.363,-348.74 232.194,-340.248"/>
-<polygon fill="#737373" stroke="#737373" points="253.894,-375.897 244.85,-369.695 251.294,-371.626 248.694,-367.355 248.694,-367.355 248.694,-367.355 251.294,-371.626 252.538,-365.015 253.894,-375.897 253.894,-375.897"/>
-</g>
-<!-- _mul1 -->
-<g id="node13" class="node"><title>_mul1</title>
-<path fill="#fccde5" stroke="#90094e" stroke-width="2" d="M194,-528C194,-528 124,-528 124,-528 118,-528 112,-522 112,-516 112,-516 112,-482 112,-482 112,-476 118,-470 124,-470 124,-470 194,-470 194,-470 200,-470 206,-476 206,-482 206,-482 206,-516 206,-516 206,-522 200,-528 194,-528"/>
-<text text-anchor="middle" x="159" y="-501" font-family="Times,serif" font-size="10.00">_mul1</text>
-<text text-anchor="middle" x="159" y="-491" font-family="Times,serif" font-size="10.00">_Mul</text>
-</g>
-<!-- _mul1&#45;&gt;activation0 -->
-<g id="edge11" class="edge"><title>_mul1&#45;&gt;activation0</title>
-<path fill="none" stroke="#737373" d="M159,-459.744C159,-451.204 159,-442.298 159,-434.248"/>
-<polygon fill="#737373" stroke="#737373" points="159,-469.897 154.5,-459.897 159,-464.897 159,-459.897 159,-459.897 159,-459.897 159,-464.897 163.5,-459.897 159,-469.897 159,-469.897"/>
-</g>
-<!-- _mul1&#45;&gt;activation1 -->
-<g id="edge12" class="edge"><title>_mul1&#45;&gt;activation1</title>
-<path fill="none" stroke="#737373" d="M201.148,-463.378C212.849,-453.767 225.411,-443.448 236.611,-434.248"/>
-<polygon fill="#737373" stroke="#737373" points="193.212,-469.897 198.083,-460.072 197.076,-466.723 200.94,-463.55 200.94,-463.55 200.94,-463.55 197.076,-466.723 203.796,-467.027 193.212,-469.897 193.212,-469.897"/>
-</g>
-<!-- _plus1 -->
-<g id="node14" class="node"><title>_plus1</title>
-<path fill="#fccde5" stroke="#90094e" stroke-width="2" d="M194,-622C194,-622 124,-622 124,-622 118,-622 112,-616 112,-610 112,-610 112,-576 112,-576 112,-570 118,-564 124,-564 124,-564 194,-564 194,-564 200,-564 206,-570 206,-576 206,-576 206,-610 206,-610 206,-616 200,-622 194,-622"/>
-<text text-anchor="middle" x="159" y="-595" font-family="Times,serif" font-size="10.00">_plus1</text>
-<text text-anchor="middle" x="159" y="-585" font-family="Times,serif" font-size="10.00">_Plus</text>
-</g>
-<!-- _plus1&#45;&gt;_mul0 -->
-<g id="edge13" class="edge"><title>_plus1&#45;&gt;_mul0</title>
-<path fill="none" stroke="#737373" d="M116.852,-557.378C105.151,-547.767 92.5885,-537.448 81.3887,-528.248"/>
-<polygon fill="#737373" stroke="#737373" points="124.788,-563.897 114.204,-561.027 120.924,-560.723 117.06,-557.55 117.06,-557.55 117.06,-557.55 120.924,-560.723 119.917,-554.072 124.788,-563.897 124.788,-563.897"/>
-</g>
-<!-- _plus1&#45;&gt;_mul1 -->
-<g id="edge14" class="edge"><title>_plus1&#45;&gt;_mul1</title>
-<path fill="none" stroke="#737373" d="M159,-553.744C159,-545.204 159,-536.298 159,-528.248"/>
-<polygon fill="#737373" stroke="#737373" points="159,-563.897 154.5,-553.897 159,-558.897 159,-553.897 159,-553.897 159,-553.897 159,-558.897 163.5,-553.897 159,-563.897 159,-563.897"/>
-</g>
-<!-- activation4 -->
-<g id="node15" class="node"><title>activation4</title>
-<path fill="#ffffb3" stroke="#999900" stroke-width="2" d="M288,-716C288,-716 218,-716 218,-716 212,-716 206,-710 206,-704 206,-704 206,-670 206,-670 206,-664 212,-658 218,-658 218,-658 288,-658 288,-658 294,-658 300,-664 300,-670 300,-670 300,-704 300,-704 300,-710 294,-716 288,-716"/>
-<text text-anchor="middle" x="253" y="-694" font-family="Times,serif" font-size="10.00">activation4</text>
-<text text-anchor="middle" x="253" y="-684" font-family="Times,serif" font-size="10.00">Activation</text>
-<text text-anchor="middle" x="253" y="-674" font-family="Times,serif" font-size="10.00">act&#45;type=tanh</text>
-</g>
-<!-- activation4&#45;&gt;_plus1 -->
-<g id="edge15" class="edge"><title>activation4&#45;&gt;_plus1</title>
-<path fill="none" stroke="#737373" d="M217.058,-650.823C207.4,-641.37 197.081,-631.271 187.862,-622.248"/>
-<polygon fill="#737373" stroke="#737373" points="224.286,-657.897 213.992,-654.118 220.713,-654.4 217.139,-650.902 217.139,-650.902 217.139,-650.902 220.713,-654.4 220.287,-647.686 224.286,-657.897 224.286,-657.897"/>
-</g>
-<!-- _mul2 -->
-<g id="node16" class="node"><title>_mul2</title>
-<path fill="#fccde5" stroke="#90094e" stroke-width="2" d="M381,-810C381,-810 311,-810 311,-810 305,-810 299,-804 299,-798 299,-798 299,-764 299,-764 299,-758 305,-752 311,-752 311,-752 381,-752 381,-752 387,-752 393,-758 393,-764 393,-764 393,-798 393,-798 393,-804 387,-810 381,-810"/>
-<text text-anchor="middle" x="346" y="-783" font-family="Times,serif" font-size="10.00">_mul2</text>
-<text text-anchor="middle" x="346" y="-773" font-family="Times,serif" font-size="10.00">_Mul</text>
-</g>
-<!-- _mul2&#45;&gt;activation3 -->
-<g id="edge16" class="edge"><title>_mul2&#45;&gt;activation3</title>
-<path fill="none" stroke="#737373" d="M346,-741.746C346,-706.206 346,-654.104 346,-622.21"/>
-<polygon fill="#737373" stroke="#737373" points="346,-751.751 341.5,-741.751 346,-746.751 346,-741.751 346,-741.751 346,-741.751 346,-746.751 350.5,-741.751 346,-751.751 346,-751.751"/>
-</g>
-<!-- _mul2&#45;&gt;activation4 -->
-<g id="edge17" class="edge"><title>_mul2&#45;&gt;activation4</title>
-<path fill="none" stroke="#737373" d="M310.441,-744.823C300.885,-735.37 290.676,-725.271 281.555,-716.248"/>
-<polygon fill="#737373" stroke="#737373" points="317.592,-751.897 307.318,-748.063 314.037,-748.381 310.482,-744.864 310.482,-744.864 310.482,-744.864 314.037,-748.381 313.647,-741.665 317.592,-751.897 317.592,-751.897"/>
-</g>
-<!-- ptb_lstm_2_h2h -->
-<g id="node17" class="node"><title>ptb_lstm_2_h2h</title>
-<path fill="#fb8072" stroke="#941305" stroke-width="2" d="M381,-904C381,-904 311,-904 311,-904 305,-904 299,-898 299,-892 299,-892 299,-858 299,-858 299,-852 305,-846 311,-846 311,-846 381,-846 381,-846 387,-846 393,-852 393,-858 393,-858 393,-892 393,-892 393,-898 387,-904 381,-904"/>
-<text text-anchor="middle" x="346" y="-882" font-family="Times,serif" font-size="10.00">ptb_lstm_2_h2h</text>
-<text text-anchor="middle" x="346" y="-872" font-family="Times,serif" font-size="10.00">FullyConnected</text>
-<text text-anchor="middle" x="346" y="-862" font-family="Times,serif" font-size="10.00">num&#45;hidden=1024</text>
-</g>
-<!-- ptb_lstm_2_h2h&#45;&gt;_mul2 -->
-<g id="edge18" class="edge"><title>ptb_lstm_2_h2h&#45;&gt;_mul2</title>
-<path fill="none" stroke="#737373" d="M346,-835.744C346,-827.204 346,-818.298 346,-810.248"/>
-<polygon fill="#737373" stroke="#737373" points="346,-845.897 341.5,-835.897 346,-840.897 346,-835.897 346,-835.897 346,-835.897 346,-840.897 350.5,-835.897 346,-845.897 346,-845.897"/>
-</g>
-<!-- _plus2 -->
-<g id="node18" class="node"><title>_plus2</title>
-<path fill="#fccde5" stroke="#90094e" stroke-width="2" d="M381,-998C381,-998 311,-998 311,-998 305,-998 299,-992 299,-986 299,-986 299,-952 299,-952 299,-946 305,-940 311,-940 311,-940 381,-940 381,-940 387,-940 393,-946 393,-952 393,-952 393,-986 393,-986 393,-992 387,-998 381,-998"/>
-<text text-anchor="middle" x="346" y="-971" font-family="Times,serif" font-size="10.00">_plus2</text>
-<text text-anchor="middle" x="346" y="-961" font-family="Times,serif" font-size="10.00">_Plus</text>
-</g>
-<!-- _plus2&#45;&gt;ptb_lstm_2_i2h -->
-<g id="edge19" class="edge"><title>_plus2&#45;&gt;ptb_lstm_2_i2h</title>
-<path fill="none" stroke="#737373" d="M303.852,-933.378C292.151,-923.767 279.589,-913.448 268.389,-904.248"/>
-<polygon fill="#737373" stroke="#737373" points="311.788,-939.897 301.204,-937.027 307.924,-936.723 304.06,-933.55 304.06,-933.55 304.06,-933.55 307.924,-936.723 306.917,-930.072 311.788,-939.897 311.788,-939.897"/>
-</g>
-<!-- _plus2&#45;&gt;ptb_lstm_2_h2h -->
-<g id="edge20" class="edge"><title>_plus2&#45;&gt;ptb_lstm_2_h2h</title>
-<path fill="none" stroke="#737373" d="M346,-929.744C346,-921.204 346,-912.298 346,-904.248"/>
-<polygon fill="#737373" stroke="#737373" points="346,-939.897 341.5,-929.897 346,-934.897 346,-929.897 346,-929.897 346,-929.897 346,-934.897 350.5,-929.897 346,-939.897 346,-939.897"/>
-</g>
-<!-- ptb_lstm_2_gates -->
-<g id="node19" class="node"><title>ptb_lstm_2_gates</title>
-<path fill="#fccde5" stroke="#90094e" stroke-width="2" d="M381,-1092C381,-1092 311,-1092 311,-1092 305,-1092 299,-1086 299,-1080 299,-1080 299,-1046 299,-1046 299,-1040 305,-1034 311,-1034 311,-1034 381,-1034 381,-1034 387,-1034 393,-1040 393,-1046 393,-1046 393,-1080 393,-1080 393,-1086 387,-1092 381,-1092"/>
-<text text-anchor="middle" x="346" y="-1065" font-family="Times,serif" font-size="10.00">ptb_lstm_2_gates</text>
-<text text-anchor="middle" x="346" y="-1055" font-family="Times,serif" font-size="10.00">SliceChannel</text>
-</g>
-<!-- ptb_lstm_2_gates&#45;&gt;_plus2 -->
-<g id="edge21" class="edge"><title>ptb_lstm_2_gates&#45;&gt;_plus2</title>
-<path fill="none" stroke="#737373" d="M346,-1023.74C346,-1015.2 346,-1006.3 346,-998.248"/>
-<polygon fill="#737373" stroke="#737373" points="346,-1033.9 341.5,-1023.9 346,-1028.9 346,-1023.9 346,-1023.9 346,-1023.9 346,-1028.9 350.5,-1023.9 346,-1033.9 346,-1033.9"/>
-</g>
-<!-- activation8 -->
-<g id="node20" class="node"><title>activation8</title>
-<path fill="#ffffb3" stroke="#999900" stroke-width="2" d="M568,-1374C568,-1374 498,-1374 498,-1374 492,-1374 486,-1368 486,-1362 486,-1362 486,-1328 486,-1328 486,-1322 492,-1316 498,-1316 498,-1316 568,-1316 568,-1316 574,-1316 580,-1322 580,-1328 580,-1328 580,-1362 580,-1362 580,-1368 574,-1374 568,-1374"/>
-<text text-anchor="middle" x="533" y="-1352" font-family="Times,serif" font-size="10.00">activation8</text>
-<text text-anchor="middle" x="533" y="-1342" font-family="Times,serif" font-size="10.00">Activation</text>
-<text text-anchor="middle" x="533" y="-1332" font-family="Times,serif" font-size="10.00">act&#45;type=sigmoid</text>
-</g>
-<!-- activation8&#45;&gt;ptb_lstm_2_gates -->
-<g id="edge22" class="edge"><title>activation8&#45;&gt;ptb_lstm_2_gates</title>
-<path fill="none" stroke="#737373" d="M540.878,-1305.79C548.226,-1258.96 552.937,-1179.51 514,-1128 485.58,-1090.4 432.237,-1074.89 393.26,-1068.49"/>
-<polygon fill="#737373" stroke="#737373" points="539.183,-1315.83 536.411,-1305.22 540.015,-1310.9 540.848,-1305.97 540.848,-1305.97 540.848,-1305.97 540.015,-1310.9 545.285,-1306.72 539.183,-1315.83 539.183,-1315.83"/>
-</g>
-<!-- activation7 -->
-<g id="node21" class="node"><title>activation7</title>
-<path fill="#ffffb3" stroke="#999900" stroke-width="2" d="M269,-1186C269,-1186 199,-1186 199,-1186 193,-1186 187,-1180 187,-1174 187,-1174 187,-1140 187,-1140 187,-1134 193,-1128 199,-1128 199,-1128 269,-1128 269,-1128 275,-1128 281,-1134 281,-1140 281,-1140 281,-1174 281,-1174 281,-1180 275,-1186 269,-1186"/>
-<text text-anchor="middle" x="234" y="-1164" font-family="Times,serif" font-size="10.00">activation7</text>
-<text text-anchor="middle" x="234" y="-1154" font-family="Times,serif" font-size="10.00">Activation</text>
-<text text-anchor="middle" x="234" y="-1144" font-family="Times,serif" font-size="10.00">act&#45;type=sigmoid</text>
-</g>
-<!-- activation7&#45;&gt;ptb_lstm_2_gates -->
-<g id="edge23" class="edge"><title>activation7&#45;&gt;ptb_lstm_2_gates</title>
-<path fill="none" stroke="#737373" d="M276.148,-1121.38C287.849,-1111.77 300.411,-1101.45 311.611,-1092.25"/>
-<polygon fill="#737373" stroke="#737373" points="268.212,-1127.9 273.083,-1118.07 272.076,-1124.72 275.94,-1121.55 275.94,-1121.55 275.94,-1121.55 272.076,-1124.72 278.796,-1125.03 268.212,-1127.9 268.212,-1127.9"/>
-</g>
-<!-- _mul3 -->
-<g id="node22" class="node"><title>_mul3</title>
-<path fill="#fccde5" stroke="#90094e" stroke-width="2" d="M269,-1280C269,-1280 199,-1280 199,-1280 193,-1280 187,-1274 187,-1268 187,-1268 187,-1234 187,-1234 187,-1228 193,-1222 199,-1222 199,-1222 269,-1222 269,-1222 275,-1222 281,-1228 281,-1234 281,-1234 281,-1268 281,-1268 281,-1274 275,-1280 269,-1280"/>
-<text text-anchor="middle" x="234" y="-1253" font-family="Times,serif" font-size="10.00">_mul3</text>
-<text text-anchor="middle" x="234" y="-1243" font-family="Times,serif" font-size="10.00">_Mul</text>
-</g>
-<!-- _mul3&#45;&gt;_plus1 -->
-<g id="edge25" class="edge"><title>_mul3&#45;&gt;_plus1</title>
-<path fill="none" stroke="#737373" d="M196.635,-1214.32C189.434,-1205.69 182.685,-1196.04 178,-1186 154.792,-1136.27 159,-1118.88 159,-1064 159,-1064 159,-1064 159,-780 159,-723.974 159,-658.455 159,-622.045"/>
-<polygon fill="#737373" stroke="#737373" points="203.329,-1221.94 193.348,-1217.4 200.029,-1218.18 196.729,-1214.43 196.729,-1214.43 196.729,-1214.43 200.029,-1218.18 200.11,-1211.46 203.329,-1221.94 203.329,-1221.94"/>
-</g>
-<!-- _mul3&#45;&gt;activation7 -->
-<g id="edge24" class="edge"><title>_mul3&#45;&gt;activation7</title>
-<path fill="none" stroke="#737373" d="M234,-1211.74C234,-1203.2 234,-1194.3 234,-1186.25"/>
-<polygon fill="#737373" stroke="#737373" points="234,-1221.9 229.5,-1211.9 234,-1216.9 234,-1211.9 234,-1211.9 234,-1211.9 234,-1216.9 238.5,-1211.9 234,-1221.9 234,-1221.9"/>
-</g>
-<!-- activation5 -->
-<g id="node23" class="node"><title>activation5</title>
-<path fill="#ffffb3" stroke="#999900" stroke-width="2" d="M381,-1186C381,-1186 311,-1186 311,-1186 305,-1186 299,-1180 299,-1174 299,-1174 299,-1140 299,-1140 299,-1134 305,-1128 311,-1128 311,-1128 381,-1128 381,-1128 387,-1128 393,-1134 393,-1140 393,-1140 393,-1174 393,-1174 393,-1180 387,-1186 381,-1186"/>
-<text text-anchor="middle" x="346" y="-1164" font-family="Times,serif" font-size="10.00">activation5</text>
-<text text-anchor="middle" x="346" y="-1154" font-family="Times,serif" font-size="10.00">Activation</text>
-<text text-anchor="middle" x="346" y="-1144" font-family="Times,serif" font-size="10.00">act&#45;type=sigmoid</text>
-</g>
-<!-- activation5&#45;&gt;ptb_lstm_2_gates -->
-<g id="edge26" class="edge"><title>activation5&#45;&gt;ptb_lstm_2_gates</title>
-<path fill="none" stroke="#737373" d="M346,-1117.74C346,-1109.2 346,-1100.3 346,-1092.25"/>
-<polygon fill="#737373" stroke="#737373" points="346,-1127.9 341.5,-1117.9 346,-1122.9 346,-1117.9 346,-1117.9 346,-1117.9 346,-1122.9 350.5,-1117.9 346,-1127.9 346,-1127.9"/>
-</g>
-<!-- activation6 -->
-<g id="node24" class="node"><title>activation6</title>
-<path fill="#ffffb3" stroke="#999900" stroke-width="2" d="M493,-1186C493,-1186 423,-1186 423,-1186 417,-1186 411,-1180 411,-1174 411,-1174 411,-1140 411,-1140 411,-1134 417,-1128 423,-1128 423,-1128 493,-1128 493,-1128 499,-1128 505,-1134 505,-1140 505,-1140 505,-1174 505,-1174 505,-1180 499,-1186 493,-1186"/>
-<text text-anchor="middle" x="458" y="-1164" font-family="Times,serif" font-size="10.00">activation6</text>
-<text text-anchor="middle" x="458" y="-1154" font-family="Times,serif" font-size="10.00">Activation</text>
-<text text-anchor="middle" x="458" y="-1144" font-family="Times,serif" font-size="10.00">act&#45;type=tanh</text>
-</g>
-<!-- activation6&#45;&gt;ptb_lstm_2_gates -->
-<g id="edge27" class="edge"><title>activation6&#45;&gt;ptb_lstm_2_gates</title>
-<path fill="none" stroke="#737373" d="M415.852,-1121.38C404.151,-1111.77 391.589,-1101.45 380.389,-1092.25"/>
-<polygon fill="#737373" stroke="#737373" points="423.788,-1127.9 413.204,-1125.03 419.924,-1124.72 416.06,-1121.55 416.06,-1121.55 416.06,-1121.55 419.924,-1124.72 418.917,-1118.07 423.788,-1127.9 423.788,-1127.9"/>
-</g>
-<!-- _mul4 -->
-<g id="node25" class="node"><title>_mul4</title>
-<path fill="#fccde5" stroke="#90094e" stroke-width="2" d="M381,-1280C381,-1280 311,-1280 311,-1280 305,-1280 299,-1274 299,-1268 299,-1268 299,-1234 299,-1234 299,-1228 305,-1222 311,-1222 311,-1222 381,-1222 381,-1222 387,-1222 393,-1228 393,-1234 393,-1234 393,-1268 393,-1268 393,-1274 387,-1280 381,-1280"/>
-<text text-anchor="middle" x="346" y="-1253" font-family="Times,serif" font-size="10.00">_mul4</text>
-<text text-anchor="middle" x="346" y="-1243" font-family="Times,serif" font-size="10.00">_Mul</text>
-</g>
-<!-- _mul4&#45;&gt;activation5 -->
-<g id="edge28" class="edge"><title>_mul4&#45;&gt;activation5</title>
-<path fill="none" stroke="#737373" d="M346,-1211.74C346,-1203.2 346,-1194.3 346,-1186.25"/>
-<polygon fill="#737373" stroke="#737373" points="346,-1221.9 341.5,-1211.9 346,-1216.9 346,-1211.9 346,-1211.9 346,-1211.9 346,-1216.9 350.5,-1211.9 346,-1221.9 346,-1221.9"/>
-</g>
-<!-- _mul4&#45;&gt;activation6 -->
-<g id="edge29" class="edge"><title>_mul4&#45;&gt;activation6</title>
-<path fill="none" stroke="#737373" d="M388.148,-1215.38C399.849,-1205.77 412.411,-1195.45 423.611,-1186.25"/>
-<polygon fill="#737373" stroke="#737373" points="380.212,-1221.9 385.083,-1212.07 384.076,-1218.72 387.94,-1215.55 387.94,-1215.55 387.94,-1215.55 384.076,-1218.72 390.796,-1219.03 380.212,-1221.9 380.212,-1221.9"/>
-</g>
-<!-- _plus3 -->
-<g id="node26" class="node"><title>_plus3</title>
-<path fill="#fccde5" stroke="#90094e" stroke-width="2" d="M381,-1374C381,-1374 311,-1374 311,-1374 305,-1374 299,-1368 299,-1362 299,-1362 299,-1328 299,-1328 299,-1322 305,-1316 311,-1316 311,-1316 381,-1316 381,-1316 387,-1316 393,-1322 393,-1328 393,-1328 393,-1362 393,-1362 393,-1368 387,-1374 381,-1374"/>
-<text text-anchor="middle" x="346" y="-1347" font-family="Times,serif" font-size="10.00">_plus3</text>
-<text text-anchor="middle" x="346" y="-1337" font-family="Times,serif" font-size="10.00">_Plus</text>
-</g>
-<!-- _plus3&#45;&gt;_mul3 -->
-<g id="edge30" class="edge"><title>_plus3&#45;&gt;_mul3</title>
-<path fill="none" stroke="#737373" d="M303.852,-1309.38C292.151,-1299.77 279.589,-1289.45 268.389,-1280.25"/>
-<polygon fill="#737373" stroke="#737373" points="311.788,-1315.9 301.204,-1313.03 307.924,-1312.72 304.06,-1309.55 304.06,-1309.55 304.06,-1309.55 307.924,-1312.72 306.917,-1306.07 311.788,-1315.9 311.788,-1315.9"/>
-</g>
-<!-- _plus3&#45;&gt;_mul4 -->
-<g id="edge31" class="edge"><title>_plus3&#45;&gt;_mul4</title>
-<path fill="none" stroke="#737373" d="M346,-1305.74C346,-1297.2 346,-1288.3 346,-1280.25"/>
-<polygon fill="#737373" stroke="#737373" points="346,-1315.9 341.5,-1305.9 346,-1310.9 346,-1305.9 346,-1305.9 346,-1305.9 346,-1310.9 350.5,-1305.9 346,-1315.9 346,-1315.9"/>
-</g>
-<!-- activation9 -->
-<g id="node27" class="node"><title>activation9</title>
-<path fill="#ffffb3" stroke="#999900" stroke-width="2" d="M493,-1468C493,-1468 423,-1468 423,-1468 417,-1468 411,-1462 411,-1456 411,-1456 411,-1422 411,-1422 411,-1416 417,-1410 423,-1410 423,-1410 493,-1410 493,-1410 499,-1410 505,-1416 505,-1422 505,-1422 505,-1456 505,-1456 505,-1462 499,-1468 493,-1468"/>
-<text text-anchor="middle" x="458" y="-1446" font-family="Times,serif" font-size="10.00">activation9</text>
-<text text-anchor="middle" x="458" y="-1436" font-family="Times,serif" font-size="10.00">Activation</text>
-<text text-anchor="middle" x="458" y="-1426" font-family="Times,serif" font-size="10.00">act&#45;type=tanh</text>
-</g>
-<!-- activation9&#45;&gt;_plus3 -->
-<g id="edge32" class="edge"><title>activation9&#45;&gt;_plus3</title>
-<path fill="none" stroke="#737373" d="M415.852,-1403.38C404.151,-1393.77 391.589,-1383.45 380.389,-1374.25"/>
-<polygon fill="#737373" stroke="#737373" points="423.788,-1409.9 413.204,-1407.03 419.924,-1406.72 416.06,-1403.55 416.06,-1403.55 416.06,-1403.55 419.924,-1406.72 418.917,-1400.07 423.788,-1409.9 423.788,-1409.9"/>
-</g>
-<!-- _mul5 -->
-<g id="node28" class="node"><title>_mul5</title>
-<path fill="#fccde5" stroke="#90094e" stroke-width="2" d="M530,-1562C530,-1562 460,-1562 460,-1562 454,-1562 448,-1556 448,-1550 448,-1550 448,-1516 448,-1516 448,-1510 454,-1504 460,-1504 460,-1504 530,-1504 530,-1504 536,-1504 542,-1510 542,-1516 542,-1516 542,-1550 542,-1550 542,-1556 536,-1562 530,-1562"/>
-<text text-anchor="middle" x="495" y="-1535" font-family="Times,serif" font-size="10.00">_mul5</text>
-<text text-anchor="middle" x="495" y="-1525" font-family="Times,serif" font-size="10.00">_Mul</text>
-</g>
-<!-- _mul5&#45;&gt;activation8 -->
-<g id="edge33" class="edge"><title>_mul5&#45;&gt;activation8</title>
-<path fill="none" stroke="#737373" d="M507.152,-1494.14C509.638,-1485.6 512.078,-1476.53 514,-1468 521.159,-1436.21 526.515,-1399.16 529.685,-1374.31"/>
-<polygon fill="#737373" stroke="#737373" points="504.242,-1503.86 502.799,-1492.99 505.676,-1499.07 507.11,-1494.28 507.11,-1494.28 507.11,-1494.28 505.676,-1499.07 511.421,-1495.57 504.242,-1503.86 504.242,-1503.86"/>
-</g>
-<!-- _mul5&#45;&gt;activation9 -->
-<g id="edge34" class="edge"><title>_mul5&#45;&gt;activation9</title>
-<path fill="none" stroke="#737373" d="M479.954,-1494.59C476.417,-1485.79 472.705,-1476.56 469.361,-1468.25"/>
-<polygon fill="#737373" stroke="#737373" points="483.698,-1503.9 475.791,-1496.3 481.832,-1499.26 479.966,-1494.62 479.966,-1494.62 479.966,-1494.62 481.832,-1499.26 484.141,-1492.94 483.698,-1503.9 483.698,-1503.9"/>
-</g>
-<!-- ptb_l1_last_h -->
-<g id="node29" class="node"><title>ptb_l1_last_h</title>
-<path fill="#b3de69" stroke="#597d1c" stroke-width="2" d="M474,-1656C474,-1656 404,-1656 404,-1656 398,-1656 392,-1650 392,-1644 392,-1644 392,-1610 392,-1610 392,-1604 398,-1598 404,-1598 404,-1598 474,-1598 474,-1598 480,-1598 486,-1604 486,-1610 486,-1610 486,-1644 486,-1644 486,-1650 480,-1656 474,-1656"/>
-<text text-anchor="middle" x="439" y="-1629" font-family="Times,serif" font-size="10.00">ptb_l1_last_h</text>
-<text text-anchor="middle" x="439" y="-1619" font-family="Times,serif" font-size="10.00">BlockGrad</text>
-</g>
-<!-- ptb_l1_last_h&#45;&gt;_mul5 -->
-<g id="edge35" class="edge"><title>ptb_l1_last_h&#45;&gt;_mul5</title>
-<path fill="none" stroke="#737373" d="M461.431,-1589.15C466.887,-1580.19 472.637,-1570.74 477.806,-1562.25"/>
-<polygon fill="#737373" stroke="#737373" points="456.106,-1597.9 457.462,-1587.02 458.706,-1593.63 461.306,-1589.36 461.306,-1589.36 461.306,-1589.36 458.706,-1593.63 465.15,-1591.69 456.106,-1597.9 456.106,-1597.9"/>
-</g>
-<!-- ptb_l1_last_c -->
-<g id="node30" class="node"><title>ptb_l1_last_c</title>
-<path fill="#b3de69" stroke="#597d1c" stroke-width="2" d="M381,-1468C381,-1468 311,-1468 311,-1468 305,-1468 299,-1462 299,-1456 299,-1456 299,-1422 299,-1422 299,-1416 305,-1410 311,-1410 311,-1410 381,-1410 381,-1410 387,-1410 393,-1416 393,-1422 393,-1422 393,-1456 393,-1456 393,-1462 387,-1468 381,-1468"/>
-<text text-anchor="middle" x="346" y="-1441" font-family="Times,serif" font-size="10.00">ptb_l1_last_c</text>
-<text text-anchor="middle" x="346" y="-1431" font-family="Times,serif" font-size="10.00">BlockGrad</text>
-</g>
-<!-- ptb_l1_last_c&#45;&gt;_plus3 -->
-<g id="edge36" class="edge"><title>ptb_l1_last_c&#45;&gt;_plus3</title>
-<path fill="none" stroke="#737373" d="M346,-1399.74C346,-1391.2 346,-1382.3 346,-1374.25"/>
-<polygon fill="#737373" stroke="#737373" points="346,-1409.9 341.5,-1399.9 346,-1404.9 346,-1399.9 346,-1399.9 346,-1399.9 346,-1404.9 350.5,-1399.9 346,-1409.9 346,-1409.9"/>
-</g>
-<!-- ptb_pred_2 -->
-<g id="node31" class="node"><title>ptb_pred_2</title>
-<path fill="#fb8072" stroke="#941305" stroke-width="2" d="M586,-1656C586,-1656 516,-1656 516,-1656 510,-1656 504,-1650 504,-1644 504,-1644 504,-1610 504,-1610 504,-1604 510,-1598 516,-1598 516,-1598 586,-1598 586,-1598 592,-1598 598,-1604 598,-1610 598,-1610 598,-1644 598,-1644 598,-1650 592,-1656 586,-1656"/>
-<text text-anchor="middle" x="551" y="-1634" font-family="Times,serif" font-size="10.00">ptb_pred_2</text>
-<text text-anchor="middle" x="551" y="-1624" font-family="Times,serif" font-size="10.00">FullyConnected</text>
-<text text-anchor="middle" x="551" y="-1614" font-family="Times,serif" font-size="10.00">num&#45;hidden=128</text>
-</g>
-<!-- ptb_pred_2&#45;&gt;_mul5 -->
-<g id="edge37" class="edge"><title>ptb_pred_2&#45;&gt;_mul5</title>
-<path fill="none" stroke="#737373" d="M528.569,-1589.15C523.113,-1580.19 517.363,-1570.74 512.194,-1562.25"/>
-<polygon fill="#737373" stroke="#737373" points="533.894,-1597.9 524.85,-1591.69 531.294,-1593.63 528.694,-1589.36 528.694,-1589.36 528.694,-1589.36 531.294,-1593.63 532.538,-1587.02 533.894,-1597.9 533.894,-1597.9"/>
-</g>
-<!-- ptb_softmax_2 -->
-<g id="node32" class="node"><title>ptb_softmax_2</title>
-<path fill="#b3de69" stroke="#597d1c" stroke-width="2" d="M586,-1750C586,-1750 516,-1750 516,-1750 510,-1750 504,-1744 504,-1738 504,-1738 504,-1704 504,-1704 504,-1698 510,-1692 516,-1692 516,-1692 586,-1692 586,-1692 592,-1692 598,-1698 598,-1704 598,-1704 598,-1738 598,-1738 598,-1744 592,-1750 586,-1750"/>
-<text text-anchor="middle" x="551" y="-1723" font-family="Times,serif" font-size="10.00">ptb_softmax_2</text>
-<text text-anchor="middle" x="551" y="-1713" font-family="Times,serif" font-size="10.00">SoftmaxOutput</text>
-</g>
-<!-- ptb_softmax_2&#45;&gt;ptb_pred_2 -->
-<g id="edge38" class="edge"><title>ptb_softmax_2&#45;&gt;ptb_pred_2</title>
-<path fill="none" stroke="#737373" d="M551,-1681.74C551,-1673.2 551,-1664.3 551,-1656.25"/>
-<polygon fill="#737373" stroke="#737373" points="551,-1691.9 546.5,-1681.9 551,-1686.9 551,-1681.9 551,-1681.9 551,-1681.9 551,-1686.9 555.5,-1681.9 551,-1691.9 551,-1691.9"/>
-</g>
-<!-- ptb_pred_1 -->
-<g id="node33" class="node"><title>ptb_pred_1</title>
-<path fill="#fb8072" stroke="#941305" stroke-width="2" d="M493,-904C493,-904 423,-904 423,-904 417,-904 411,-898 411,-892 411,-892 411,-858 411,-858 411,-852 417,-846 423,-846 423,-846 493,-846 493,-846 499,-846 505,-852 505,-858 505,-858 505,-892 505,-892 505,-898 499,-904 493,-904"/>
-<text text-anchor="middle" x="458" y="-882" font-family="Times,serif" font-size="10.00">ptb_pred_1</text>
-<text text-anchor="middle" x="458" y="-872" font-family="Times,serif" font-size="10.00">FullyConnected</text>
-<text text-anchor="middle" x="458" y="-862" font-family="Times,serif" font-size="10.00">num&#45;hidden=128</text>
-</g>
-<!-- ptb_pred_1&#45;&gt;_mul2 -->
-<g id="edge39" class="edge"><title>ptb_pred_1&#45;&gt;_mul2</title>
-<path fill="none" stroke="#737373" d="M415.852,-839.378C404.151,-829.767 391.589,-819.448 380.389,-810.248"/>
-<polygon fill="#737373" stroke="#737373" points="423.788,-845.897 413.204,-843.027 419.924,-842.723 416.06,-839.55 416.06,-839.55 416.06,-839.55 419.924,-842.723 418.917,-836.072 423.788,-845.897 423.788,-845.897"/>
-</g>
-<!-- ptb_softmax_1 -->
-<g id="node34" class="node"><title>ptb_softmax_1</title>
-<path fill="#b3de69" stroke="#597d1c" stroke-width="2" d="M493,-998C493,-998 423,-998 423,-998 417,-998 411,-992 411,-986 411,-986 411,-952 411,-952 411,-946 417,-940 423,-940 423,-940 493,-940 493,-940 499,-940 505,-946 505,-952 505,-952 505,-986 505,-986 505,-992 499,-998 493,-998"/>
-<text text-anchor="middle" x="458" y="-971" font-family="Times,serif" font-size="10.00">ptb_softmax_1</text>
-<text text-anchor="middle" x="458" y="-961" font-family="Times,serif" font-size="10.00">SoftmaxOutput</text>
-</g>
-<!-- ptb_softmax_1&#45;&gt;ptb_pred_1 -->
-<g id="edge40" class="edge"><title>ptb_softmax_1&#45;&gt;ptb_pred_1</title>
-<path fill="none" stroke="#737373" d="M458,-929.744C458,-921.204 458,-912.298 458,-904.248"/>
-<polygon fill="#737373" stroke="#737373" points="458,-939.897 453.5,-929.897 458,-934.897 458,-929.897 458,-929.897 458,-929.897 458,-934.897 462.5,-929.897 458,-939.897 458,-939.897"/>
-</g>
-</g>
-</svg>
diff --git a/julia/docs/src/tutorial/mnist.md b/julia/docs/src/tutorial/mnist.md
deleted file mode 100644
index 942752364526..000000000000
--- a/julia/docs/src/tutorial/mnist.md
+++ /dev/null
@@ -1,281 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-Digit Recognition on MNIST
-==========================
-
-In this tutorial, we will work through examples of training a simple
-multi-layer perceptron and then a convolutional neural network (the
-LeNet architecture) on the [MNIST handwritten digit
-dataset](http://yann.lecun.com/exdb/mnist/). The code for this tutorial
-can be found in
-[examples/mnist](https://github.com/apache/incubator-mxnet/tree/master/julia/examples/mnist).  There are also two Jupyter notebooks that expand a little more on the [MLP](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistMLP.ipynb) and the [LeNet](https://github.com/ultradian/julia_notebooks/blob/master/mxnet/mnistLenet.ipynb), using the more general `ArrayDataProvider`. 
-
-Simple 3-layer MLP
-------------------
-
-This is a tiny 3-layer MLP that could be easily trained on CPU. The
-script starts with
-
-```julia
-using MXNet
-```
-
-to load the `MXNet` module. Then we are ready to define the network
-architecture via the [symbolic API](/api/julia/docs/api/user-guide/overview/). We start
-with a placeholder `data` symbol,
-
-```julia
-data = mx.Variable(:data)
-```
-
-and then cascading fully-connected layers and activation functions:
-
-```julia
-fc1  = mx.FullyConnected(data, name=:fc1, num_hidden=128)
-act1 = mx.Activation(fc1, name=:relu1, act_type=:relu)
-fc2  = mx.FullyConnected(act1, name=:fc2, num_hidden=64)
-act2 = mx.Activation(fc2, name=:relu2, act_type=:relu)
-fc3  = mx.FullyConnected(act2, name=:fc3, num_hidden=10)
-```
-
-Note that in each composition we pass the previous symbol as the first argument,
-forming a feedforward chain. The architecture looks like
-
-```
-Input --> 128 units (ReLU) --> 64 units (ReLU) --> 10 units
-```
-
-where the last 10 units correspond to the 10 output classes (digits
-0,...,9). We then add a final `SoftmaxOutput` operation to turn the
-10-dimensional prediction to proper probability values for the 10
-classes:
-
-```julia
-mlp  = mx.SoftmaxOutput(fc3, name=:softmax)
-```
-
-As we can see, the MLP is just a chain of layers. For this case, we can
-also use the `mx.chain` macro. The same architecture above can be
-defined as
-
-```julia
-mlp = @mx.chain mx.Variable(:data)             =>
-  mx.FullyConnected(name=:fc1, num_hidden=128) =>
-  mx.Activation(name=:relu1, act_type=:relu)   =>
-  mx.FullyConnected(name=:fc2, num_hidden=64)  =>
-  mx.Activation(name=:relu2, act_type=:relu)   =>
-  mx.FullyConnected(name=:fc3, num_hidden=10)  =>
-  mx.SoftmaxOutput(name=:softmax)
-```
-
-After defining the architecture, we are ready to load the MNIST data.
-MXNet.jl provides built-in data providers for the MNIST dataset, which
-could automatically download the dataset into
-`Pkg.dir("MXNet")/data/mnist` if necessary. We wrap the code to
-construct the data provider into `mnist-data.jl` so that it could be
-shared by both the MLP example and the LeNet ConvNets example.
-
-```julia
-batch_size = 100
-include("mnist-data.jl")
-train_provider, eval_provider = get_mnist_providers(batch_size)
-```
-
-If you need to write your own data providers for customized data format,
-please refer to [`mx.AbstractDataProvider`](@ref).
-
-Given the architecture and data, we can instantiate a *model* to do the
-actual training. `mx.FeedForward` is the built-in model that is suitable
-for most feed-forward architectures. When constructing the model, we
-also specify the *context* on which the computation should be carried
-out. Because this is a really tiny MLP, we will just run on a single CPU
-device.
-
-```julia
-model = mx.FeedForward(mlp, context=mx.cpu())
-```
-
-You can instead pass `mx.gpu()`; if a list of devices (e.g.
-`[mx.gpu(0), mx.gpu(1)]`) is provided, data parallelism will be used
-automatically. But for this tiny example, using a GPU device might not
-help.
-
-The last thing we need to specify is the optimization algorithm (a.k.a.
-*optimizer*) to use. We use basic SGD with a fixed learning rate of 0.1,
-momentum 0.9 and weight decay 0.00001:
-
-```julia
-optimizer = mx.SGD(η=0.1, μ=0.9, λ=0.00001)
-```
-
-Now we can do the training. Here the `n_epoch` parameter specifies that
-we want to train for 20 epochs. We also supply `eval_data` to monitor
-accuracy on the validation set.
-
-```julia
-mx.fit(model, optimizer, train_provider, n_epoch=20, eval_data=eval_provider)
-```
-
-Here is a sample output
-
-```
-INFO: Start training on [CPU0]
-INFO: Initializing parameters...
-INFO: Creating KVStore...
-INFO: == Epoch 001 ==========
-INFO: ## Training summary
-INFO:       :accuracy = 0.7554
-INFO:            time = 1.3165 seconds
-INFO: ## Validation summary
-INFO:       :accuracy = 0.9502
-...
-INFO: == Epoch 020 ==========
-INFO: ## Training summary
-INFO:       :accuracy = 0.9949
-INFO:            time = 0.9287 seconds
-INFO: ## Validation summary
-INFO:       :accuracy = 0.9775
-```
-
-Convolutional Neural Networks
------------------------------
-
-In the second example, we show a slightly more complicated architecture
-that involves convolution and pooling. This architecture for MNIST
-is usually called LeNet. The first part of the architecture is
-listed below:
-
-```julia
-# input
-data = mx.Variable(:data)
-
-# first conv
-conv1 = @mx.chain mx.Convolution(data, kernel=(5,5), num_filter=20)  =>
-                  mx.Activation(act_type=:tanh) =>
-                  mx.Pooling(pool_type=:max, kernel=(2,2), stride=(2,2))
-
-# second conv
-conv2 = @mx.chain mx.Convolution(conv1, kernel=(5,5), num_filter=50) =>
-                  mx.Activation(act_type=:tanh) =>
-                  mx.Pooling(pool_type=:max, kernel=(2,2), stride=(2,2))
-```
-
-We basically defined two convolution modules. Each convolution module is
-actually a chain of `Convolution`, `tanh` activation and then max
-`Pooling` operations.
-
-Each sample in the MNIST dataset is a 28x28 single-channel grayscale
-image. In the tensor format used by `NDArray`, a batch of 100 samples is
-a tensor of shape `(28,28,1,100)`. The convolution and pooling operate
-along the spatial axes, so `kernel=(5,5)` indicates a square region of
-width 5 and height 5. The rest of the architecture follows as:
-
-```julia
-# first fully-connected
-fc1   = @mx.chain mx.Flatten(conv2) =>
-                  mx.FullyConnected(num_hidden=500) =>
-                  mx.Activation(act_type=:tanh)
-
-# second fully-connected
-fc2   = mx.FullyConnected(fc1, num_hidden=10)
-
-# softmax loss
-lenet = mx.SoftmaxOutput(fc2, name=:softmax)
-```
-
-Note a fully-connected operator expects the input to be a matrix.
-However, the results from spatial convolution and pooling are 4D
-tensors. So we explicitly used a `Flatten` operator to flatten the tensor,
-before connecting it to the `FullyConnected` operator.
-
-The rest of the network is the same as the previous MLP example. As
-before, we can now load the MNIST dataset:
-
-```julia
-batch_size = 100
-include("mnist-data.jl")
-train_provider, eval_provider = get_mnist_providers(batch_size; flat=false)
-```
-
-Note we specified `flat=false` to tell the data provider to provide 4D
-tensors instead of 2D matrices because the convolution operators need
-correct spatial shape information. We then construct a feedforward model
-on GPU, and train it.
-
-```julia
-# fit model
-model = mx.FeedForward(lenet, context=mx.gpu())
-
-# optimizer
-optimizer = mx.SGD(η=0.05, μ=0.9, λ=0.00001)
-
-# fit parameters
-mx.fit(model, optimizer, train_provider, n_epoch=20, eval_data=eval_provider)
-```
-
-And here is a sample of running outputs:
-
-```
-INFO: == Epoch 001 ==========
-INFO: ## Training summary
-INFO:       :accuracy = 0.6750
-INFO:            time = 4.9814 seconds
-INFO: ## Validation summary
-INFO:       :accuracy = 0.9712
-...
-INFO: == Epoch 020 ==========
-INFO: ## Training summary
-INFO:       :accuracy = 1.0000
-INFO:            time = 4.0086 seconds
-INFO: ## Validation summary
-INFO:       :accuracy = 0.9915
-```
-
-Predicting with a trained model
--------------------------------
-
-Predicting with a trained model is very simple. By calling `mx.predict`
-with the model and a data provider, we get the model output as a Julia
-Array:
-
-```julia
-probs = mx.predict(model, eval_provider)
-```
-
-The following code shows a naive way of getting all the labels from the
-data provider and computing the prediction accuracy manually:
-
-```julia
-# collect all labels from eval data
-labels = reduce(
-  vcat,
-  copy(mx.get(eval_provider, batch, :softmax_label)) for batch ∈ eval_provider)
-# labels are 0...9
-labels .= labels .+ 1
-
-# Now we compute the accuracy
-pred = map(i -> argmax(probs[1:10, i]), 1:size(probs, 2))
-correct = sum(pred .== labels)
-@printf "Accuracy on eval set: %.2f%%\n" 100correct/length(labels)
-```
-
-Alternatively, when the dataset is huge, one can provide a callback to
-`mx.predict`, then the callback function will be invoked with the
-outputs of each mini-batch. The callback could, for example, write the
-data to disk for future inspection. In this case, no value is returned
-from `mx.predict`. See also [`mx.predict`](@ref).
diff --git a/julia/docs/src/user-guide/overview.md b/julia/docs/src/user-guide/overview.md
index 342448a15bed..6b8d954ee4a1 100644
--- a/julia/docs/src/user-guide/overview.md
+++ b/julia/docs/src/user-guide/overview.md
@@ -294,15 +294,6 @@ snippet shows a simple 2-layer MLP construction, using a hidden layer of
 using MXNet
 ```
 
-```@example fcnet
-net = mx.Variable(:data)
-net = mx.FullyConnected(net, name=:fc1, num_hidden=128)
-net = mx.Activation(net, name=:relu1, act_type=:relu)
-net = mx.FullyConnected(net, name=:fc2, num_hidden=64)
-net = mx.SoftmaxOutput(net, name=:out)
-print(net)  # debug printing
-```
-
 Each time we take the previous symbol, and compose with an operation.
 Unlike the simple `+` example above, the *operations* here are "bigger"
 ones, that correspond to common computation layers in deep neural
diff --git a/julia/examples/char-lstm/.gitignore b/julia/examples/char-lstm/.gitignore
deleted file mode 100644
index a393ee67b410..000000000000
--- a/julia/examples/char-lstm/.gitignore
+++ /dev/null
@@ -1,6 +0,0 @@
-input.txt
-vocab.dat
-checkpoints
-visualize.dot
-visualize.svg
-visualize.png
diff --git a/julia/examples/char-lstm/README.md b/julia/examples/char-lstm/README.md
deleted file mode 100644
index 155f29603623..000000000000
--- a/julia/examples/char-lstm/README.md
+++ /dev/null
@@ -1,138 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# LSTM char-rnn
-
-Because we explicitly unroll the LSTM/RNN over time for a fixed sequence length,
-it is easy to fit this model into the existing FeedForward model and re-use everything.
-To get a more flexible LSTM/RNN implementation that avoids explicit unrolling and
-deals with variable-length sequences, we still need to implement another model
-besides the existing FeedForward.
-
-To run this example, you will need to install two extra Julia packages: `Iterators.jl`
-and `StatsBase.jl`.
-
-## Training
-
-This example is adapted from the
-[example in Python binding](https://github.com/dmlc/mxnet-notebooks/blob/master/python/tutorials/char_lstm.ipynb) of
-MXNet. The data `input.txt` can be downloaded [here](https://github.com/dmlc/web-data/tree/master/mxnet/tinyshakespeare).
-
-Modify parameters in [config.jl](config.jl) and then run [train.jl](train.jl). An example output
-of training looks like this:
-```
-...
-INFO: Speed: 357.72 samples/sec
-INFO: == Epoch 020 ==========
-INFO: ## Training summary
-INFO:                NLL = 1.4672
-INFO:         perplexity = 4.3373
-INFO:               time = 87.2631 seconds
-INFO: ## Validation summary
-INFO:                NLL = 1.6374
-INFO:         perplexity = 5.1418
-INFO: Saved checkpoint to 'char-lstm/checkpoints/ptb-0020.params'
-INFO: Speed: 368.74 samples/sec
-INFO: Speed: 361.04 samples/sec
-INFO: Speed: 360.02 samples/sec
-INFO: Speed: 362.34 samples/sec
-INFO: Speed: 360.80 samples/sec
-INFO: Speed: 362.77 samples/sec
-INFO: Speed: 357.18 samples/sec
-INFO: Speed: 355.30 samples/sec
-INFO: Speed: 362.33 samples/sec
-INFO: Speed: 359.23 samples/sec
-INFO: Speed: 358.09 samples/sec
-INFO: Speed: 356.89 samples/sec
-INFO: Speed: 371.91 samples/sec
-INFO: Speed: 372.24 samples/sec
-INFO: Speed: 356.59 samples/sec
-INFO: Speed: 356.64 samples/sec
-INFO: Speed: 360.24 samples/sec
-INFO: Speed: 360.32 samples/sec
-INFO: Speed: 362.38 samples/sec
-INFO: == Epoch 021 ==========
-INFO: ## Training summary
-INFO:                NLL = 1.4655
-INFO:         perplexity = 4.3297
-INFO:               time = 86.9243 seconds
-INFO: ## Validation summary
-INFO:                NLL = 1.6366
-INFO:         perplexity = 5.1378
-INFO: Saved checkpoint to 'examples/char-lstm/checkpoints/ptb-0021.params'
-```
-
-## Sampling
-
-Run [sampler.jl](sampler.jl) to generate sample sentences from the trained model. Some example sentences are
-```
-## Sample 1
-all have sir,
-Away will fill'd in His time, I'll keep her, do not madam, if they here? Some more ha?
-
-## Sample 2
-am.
-
-CLAUDIO:
-Hone here, let her, the remedge, and I know not slept a likely, thou some soully free?
-
-## Sample 3
-arrel which noble thing
-The exchnachsureding worns: I ne'er drunken Biancas, fairer, than the lawfu?
-
-## Sample 4
-augh assalu, you'ld tell me corn;
-Farew. First, for me of a loved. Has thereat I knock you presents?
-
-## Sample 5
-ame the first answer.
-
-MARIZARINIO:
-Door of Angelo as her lord, shrield liken Here fellow the fool ?
-
-## Sample 6
-ad well.
-
-CLAUDIO:
-Soon him a fellows here; for her fine edge in a bogms' lord's wife.
-
-LUCENTIO:
-I?
-
-## Sample 7
-adrezilian measure.
-
-LUCENTIO:
-So, help'd you hath nes have a than dream's corn, beautio, I perchas?
-
-## Sample 8
-as eatter me;
-The girlly: and no other conciolation!
-
-BISTRUMIO:
-I have be rest girl. O, that I a h?
-
-## Sample 9
-and is intend you sort:
-What held her all 'clama's for maffice. Some servant.' what I say me the cu?
-
-## Sample 10
-an thoughts will said in our pleasue,
-Not scanin on him that you live; believaries she.
-
-ISABELLLLL?
-```
diff --git a/julia/examples/char-lstm/config.jl b/julia/examples/char-lstm/config.jl
deleted file mode 100644
index c6ed0ff63b3c..000000000000
--- a/julia/examples/char-lstm/config.jl
+++ /dev/null
@@ -1,40 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-const DROPOUT        = 0  # presumably the dropout ratio used when training (consumer not shown here) — TODO confirm
-const BATCH_SIZE     = 32
-const SEQ_LENGTH     = 32
-const DIM_HIDDEN     = 256  # passed to LSTM() as dim_hidden by sampler.jl; also sizes the init_c/init_h arrays there
-const DIM_EMBED      = 256  # passed to LSTM() as dim_embed by sampler.jl
-const LSTM_N_LAYER   = 2  # number of stacked LSTM layers passed to LSTM()
-const N_EPOCH        = 21
-const BASE_LR        = 0.01
-const WEIGHT_DECAY   = 0.00001
-const CLIP_GRADIENT  = 1
-const NAME           = :ptb  # prefix for every symbol/variable name built by LSTM()
-const N_GPU          = 1
-const USE_GPU        = true
-const DATA_TR_RATIO  = 0.9  # presumably the fraction of the data used for training — TODO confirm
-const CKPOINT_PREFIX = joinpath(@__DIR__, "checkpoints/$NAME")  # resolves to <this directory>/checkpoints/ptb
-
-const BATCH_SIZE_SMP = 10  # batch dimension of the arrays allocated in sampler.jl
-const SAMPLE_LENGTH  = 100
-const SAMPLE_START   = 'a'  # character used by sampler.jl as the first input of each sample
-
-const UNKNOWN_CHAR   = Char(0)
-const INPUT_FILE     = joinpath(@__DIR__, "input.txt")  # passed to build_vocabulary in sampler.jl
-const VOCAB_FILE     = joinpath(@__DIR__, "vocab.dat")  # passed to build_vocabulary in sampler.jl
diff --git a/julia/examples/char-lstm/lstm.jl b/julia/examples/char-lstm/lstm.jl
deleted file mode 100644
index 6f6640e9562b..000000000000
--- a/julia/examples/char-lstm/lstm.jl
+++ /dev/null
@@ -1,175 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# An explicitly unrolled LSTM with fixed sequence length.
-using MXNet
-
-#--LSTMState
-struct LSTMState  # pair of symbolic nodes carried from one LSTM step to the next
-  c :: mx.SymbolicNode  # cell state; scaled by the forget gate in lstm_cell
-  h :: mx.SymbolicNode  # hidden state; input of the h2h projection in lstm_cell
-end
-#--/LSTMState
-
-#--LSTMParam
-struct LSTMParam  # weight and bias symbols consumed by lstm_cell
-  i2h_W :: mx.SymbolicNode  # weight of the input-to-hidden FullyConnected
-  h2h_W :: mx.SymbolicNode  # weight of the hidden-to-hidden FullyConnected
-  i2h_b :: mx.SymbolicNode  # bias of the input-to-hidden FullyConnected
-  h2h_b :: mx.SymbolicNode  # bias of the hidden-to-hidden FullyConnected
-end
-#--/LSTMParam
-
-#--lstm_cell
-function lstm_cell(data::mx.SymbolicNode, prev_state::LSTMState, param::LSTMParam;  # builds the symbolic graph of one LSTM step
-                   num_hidden::Int=512, dropout::Real=0, name::Symbol=gensym())  # name prefixes the _i2h/_h2h/_gates node names
-
-  if dropout > 0  # dropout is applied to the cell input only, and only when a positive ratio is given
-    data = mx.Dropout(data, p=dropout)
-  end
-
-  i2h = mx.FullyConnected(data, weight=param.i2h_W, bias=param.i2h_b,  # input-to-hidden projection
-                          num_hidden=4num_hidden, name=Symbol(name, "_i2h"))  # 4x width: split into four gates below
-  h2h = mx.FullyConnected(prev_state.h, weight=param.h2h_W, bias=param.h2h_b,  # projection of the previous hidden state
-                          num_hidden=4num_hidden, name=Symbol(name, "_h2h"))
-
-  gates = mx.SliceChannel(i2h + h2h, num_outputs=4, name=Symbol(name, "_gates"))  # split the summed projections into 4 outputs
-
-  in_gate     = mx.Activation(gates[1], act_type=:sigmoid)
-  in_trans    = mx.Activation(gates[2], act_type=:tanh)  # candidate values to be added to the cell state
-  forget_gate = mx.Activation(gates[3], act_type=:sigmoid)
-  out_gate    = mx.Activation(gates[4], act_type=:sigmoid)
-
-  next_c = (forget_gate .* prev_state.c) + (in_gate .* in_trans)  # new cell state: gated old state plus gated candidate
-  next_h = out_gate .* mx.Activation(next_c, act_type=:tanh)  # new hidden state: output gate times tanh of the new cell state
-
-  return LSTMState(next_c, next_h)  # field order is (c, h)
-end
-#--/lstm_cell
-
-#--LSTM-part1
-function LSTM(n_layer::Int, seq_len::Int, dim_hidden::Int, dim_embed::Int, n_class::Int;  # builds an LSTM unrolled over seq_len steps
-              dropout::Real=0, name::Symbol=gensym(), output_states::Bool=false)  # name prefixes every variable/node name
-
-  # placeholder nodes for the parameters shared by every time step (embedding and prediction layers)
-  embed_W = mx.Variable(Symbol(name, "_embed_weight"))
-  pred_W  = mx.Variable(Symbol(name, "_pred_weight"))
-  pred_b  = mx.Variable(Symbol(name, "_pred_bias"))
-
-  layer_param_states = map(1:n_layer) do i  # one (LSTMParam, LSTMState) tuple per layer
-    param = LSTMParam(mx.Variable(Symbol(name, "_l$(i)_i2h_weight")),
-                      mx.Variable(Symbol(name, "_l$(i)_h2h_weight")),
-                      mx.Variable(Symbol(name, "_l$(i)_i2h_bias")),
-                      mx.Variable(Symbol(name, "_l$(i)_h2h_bias")))
-    state = LSTMState(mx.Variable(Symbol(name, "_l$(i)_init_c")),  # initial states are variables, to be supplied as inputs
-                      mx.Variable(Symbol(name, "_l$(i)_init_h")))
-    (param, state)
-  end
-  #...
-  #--/LSTM-part1
-
-  #--LSTM-part2
-  # now unroll over time: each step gets its own data/label variables but reuses the same weights
-  outputs = mx.SymbolicNode[]
-  for t = 1:seq_len
-    data   = mx.Variable(Symbol(name, "_data_$t"))
-    label  = mx.Variable(Symbol(name, "_label_$t"))
-    hidden = mx.FullyConnected(data, weight=embed_W, num_hidden=dim_embed,  # embedding as a bias-free linear layer
-                               no_bias=true, name=Symbol(name, "_embed_$t"))
-
-    # stack LSTM cells: the hidden output of layer i is the input of layer i+1
-    for i = 1:n_layer
-      l_param, l_state = layer_param_states[i]
-      dp = i == 1 ? 0 : dropout # don't do dropout for data
-      next_state = lstm_cell(hidden, l_state, l_param, num_hidden=dim_hidden, dropout=dp,
-                             name=Symbol(name, "_lstm_$t"))  # NOTE(review): name omits the layer index i, so all layers at step t share it
-      hidden = next_state.h
-      layer_param_states[i] = (l_param, next_state)  # carry this layer's state over to the next time step
-    end
-
-    # prediction / decoder
-    if dropout > 0
-      hidden = mx.Dropout(hidden, p=dropout)
-    end
-    pred = mx.FullyConnected(hidden, weight=pred_W, bias=pred_b, num_hidden=n_class,
-                             name=Symbol(name, "_pred_$t"))
-    smax = mx.SoftmaxOutput(pred, label, name=Symbol(name, "_softmax_$t"))
-    push!(outputs, smax)  # one softmax output per time step
-  end
-  #...
-  #--/LSTM-part2
-
-  #--LSTM-part3
-  # append block-gradient nodes to the final states
-  for i = 1:n_layer
-    l_param, l_state = layer_param_states[i]
-    final_state = LSTMState(mx.BlockGrad(l_state.c, name=Symbol(name, "_l$(i)_last_c")),
-                            mx.BlockGrad(l_state.h, name=Symbol(name, "_l$(i)_last_h")))
-    layer_param_states[i] = (l_param, final_state)
-  end
-
-  # now group all outputs together; with output_states the final c nodes, then the final h nodes, follow the softmax outputs
-  if output_states
-    outputs = outputs ∪ [x[2].c for x in layer_param_states] ∪
-                        [x[2].h for x in layer_param_states]
-  end
-  return mx.Group(outputs...)
-end
-#--/LSTM-part3
-
-
-# Negative log-likelihood evaluation metric (mx.get reports it together with the perplexity)
-mutable struct NLL <: mx.AbstractEvalMetric
-  nll_sum  :: Float64  # running sum accumulated by mx.update!
-  n_sample :: Int  # running sample count accumulated by mx.update!
-
-  NLL() = new(0.0, 0)  # start with empty accumulators
-end
-
-function mx.update!(metric::NLL, labels::Vector{<:mx.NDArray}, preds::Vector{<:mx.NDArray})  # accumulate one batch into the metric
-  @assert length(labels) == length(preds)  # labels and preds are paired one-to-one below
-  nll = 0.0
-  for (label, pred) in zip(labels, preds)
-    @mx.nd_as_jl ro=(label, pred) begin
-      nll -= sum(  # subtract the summed log-values, i.e. add the negative log-likelihood
-        log.(
-          max.(  # clamp at 1e-20 so that log never sees zero
-            getindex.(
-            (pred,),  # wrapped in a tuple so that broadcasting treats pred as a single value
-            round.(Int,label .+ 1),  # first index into pred; presumably labels are 0-based class ids — TODO confirm
-            1:length(label)),  # second index into pred: position of each label
-          1e-20)
-        )
-      )
-    end
-  end
-
-  nll = nll / length(labels)  # divide by the number of label arrays
-  metric.nll_sum += nll
-  metric.n_sample += length(labels[1])  # only the first label array is counted
-end
-
-function mx.get(metric :: NLL)  # report the accumulated metric values
-  nll  = metric.nll_sum / metric.n_sample  # accumulated sum divided by accumulated count
-  perp = exp(nll)  # perplexity is the exponential of that mean
-  return [(:NLL, nll), (:perplexity, perp)]  # (name, value) pairs
-end
-
-function mx.reset!(metric :: NLL)  # clear both accumulators
-  metric.nll_sum  = 0.0
-  metric.n_sample = 0
-end
diff --git a/julia/examples/char-lstm/sampler.jl b/julia/examples/char-lstm/sampler.jl
deleted file mode 100644
index 1a4aada22957..000000000000
--- a/julia/examples/char-lstm/sampler.jl
+++ /dev/null
@@ -1,89 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-include(joinpath(@__DIR__, "config.jl"))
-include(joinpath(@__DIR__, "lstm.jl"))
-include(joinpath(@__DIR__, "seq-data.jl"))
-
-using StatsBase
-using MXNet
-
-# load vocabulary
-vocab   = build_vocabulary(INPUT_FILE, VOCAB_FILE)
-n_class = length(vocab)
-
-# prepare data provider
-jl_data = Pair[(Symbol(NAME, "_data_$t") => zeros(mx.MX_float, (length(vocab), BATCH_SIZE_SMP)))
-               for t = 1:1]
-jl_c    = Pair[(Symbol(NAME, "_l$(l)_init_c") => zeros(mx.MX_float, (DIM_HIDDEN, BATCH_SIZE_SMP)))
-               for l = 1:LSTM_N_LAYER]
-jl_h    = Pair[(Symbol(NAME, "_l$(l)_init_h") => zeros(mx.MX_float, (DIM_HIDDEN, BATCH_SIZE_SMP)))
-               for l = 1:LSTM_N_LAYER]
-
-# the first input in the sequence
-jl_data_start = jl_data[1].second
-jl_data_start[char_idx(vocab, SAMPLE_START),:] = 1
-
-# define a LSTM with sequence length 1, also output states so that we could manually copy the states
-# when sampling the next char
-lstm  = LSTM(LSTM_N_LAYER, 1, DIM_HIDDEN, DIM_EMBED, n_class, name=NAME, output_states=true)
-model = mx.FeedForward(lstm, context=mx.cpu())
-
-# load parameters from traind LSTM, though the sequence length is different, since the weights are shared
-# over time, this should be compatible.
-model = mx.load_checkpoint(model, CKPOINT_PREFIX, N_EPOCH, allow_different_arch=true)
-
-# prepare outputs
-Base.zero(::Type{Char}) = Char(0)
-output_samples = zeros(Char, (SAMPLE_LENGTH, BATCH_SIZE_SMP))
-output_samples[1, :] = SAMPLE_START
-
-# build inverse vocabulary for convenience
-inv_vocab = Dict(v => k for (k,v) in vocab)
-
-# do prediction and sampling step by step
-for t = 2:SAMPLE_LENGTH-1
-  data    = mx.ArrayDataProvider(jl_data ∪ jl_c ∪ jl_h)
-  preds   = mx.predict(model, data)
-
-  # the first output is prediction
-  outputs = preds[1]
-
-  # do sampling and init the next inputs
-  jl_data_start[:] = 0
-  for i = 1:BATCH_SIZE_SMP
-    prob = WeightVec(outputs[:, i])
-    k    = sample(prob)
-    output_samples[t, i] = inv_vocab[k]
-    jl_data_start[k, i]  = 1
-  end
-
-  # copy the states over
-  for l = 1:LSTM_N_LAYER
-    copy!(jl_c[l][2], preds[1+l])
-    copy!(jl_h[l][2], preds[1+LSTM_N_LAYER+l])
-  end
-end
-
-output_texts = [join(output_samples[:,i]) for i = 1:BATCH_SIZE_SMP]
-output_texts = [replace(x, UNKNOWN_CHAR, '?') for x in output_texts]
-
-for (i, text) in enumerate(output_texts)
-  println("## Sample $i")
-  println(text)
-  println()
-end
diff --git a/julia/examples/char-lstm/seq-data.jl b/julia/examples/char-lstm/seq-data.jl
deleted file mode 100644
index 0df110322317..000000000000
--- a/julia/examples/char-lstm/seq-data.jl
+++ /dev/null
@@ -1,137 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# Simple data provider that load text
-using Base.Iterators
-using MXNet
-
-function build_vocabulary(corpus_fn::AbstractString, vocab_fn::AbstractString;
-                          max_vocab = 10000)
-  if isfile(vocab_fn)
-    @info("Vocabulary already exists, reusing $vocab_fn...")
-    vocab = Dict{Char,Int}(w => i for (i,w) in enumerate(read(vocab_fn, String)))
-  else
-    # count symbol frequency
-    dict = Dict{Char,Int}()
-    open(corpus_fn) do io
-      for line in eachline(io)
-        for c in line
-          dict[c] = get(dict, c, 0) + 1
-        end
-      end
-    end
-
-    vocab = sort(collect(dict), by=x->-x.second)
-    vocab = vocab[1:min(max_vocab,length(vocab))]
-    open(vocab_fn, "w") do io
-      for x in vocab
-        print(io, x.first)
-      end
-    end
-
-    vocab = Dict(x.first => i for (i,x) in enumerate(vocab))
-  end
-  vocab[UNKNOWN_CHAR] = length(vocab)
-  return vocab
-end
-
-#--CharSeqProvider
-mutable struct CharSeqProvider <: mx.AbstractDataProvider
-  text       :: AbstractString
-  batch_size :: Int
-  seq_len    :: Int
-  vocab      :: Dict{Char,Int}
-
-  prefix     :: Symbol
-  n_layer    :: Int
-  dim_hidden :: Int
-end
-#--/CharSeqProvider
-
-function mx.get_batch_size(p :: CharSeqProvider)
-  p.batch_size
-end
-
-#--provide
-function mx.provide_data(p :: CharSeqProvider)
-  [(Symbol(p.prefix, "_data_$t"), (length(p.vocab), p.batch_size)) for t = 1:p.seq_len] ∪
-  [(Symbol(p.prefix, "_l$(l)_init_c"), (p.dim_hidden, p.batch_size)) for l=1:p.n_layer] ∪
-  [(Symbol(p.prefix, "_l$(l)_init_h"), (p.dim_hidden, p.batch_size)) for l=1:p.n_layer]
-end
-function mx.provide_label(p :: CharSeqProvider)
-  [(Symbol(p.prefix, "_label_$t"), (p.batch_size,)) for t = 1:p.seq_len]
-end
-#--/provide
-
-#--eachbatch-part1
-function mx.eachbatch(p::CharSeqProvider)
-  data_all  = [mx.zeros(shape) for (name, shape) in mx.provide_data(p)]
-  label_all = [mx.zeros(shape) for (name, shape) in mx.provide_label(p)]
-
-  data_jl = [copy(x) for x in data_all]
-  label_jl= [copy(x) for x in label_all]
-
-  batch = mx.DataBatch(data_all, label_all, p.batch_size)
-  #...
-  #--/eachbatch-part1
-
-  #--eachbatch-part2
-  #...
-  function _text_iter(c::Channel)
-    text = p.text
-
-    n_batch = floor(Int, length(text) / p.batch_size / p.seq_len)
-    text = text[1:n_batch*p.batch_size*p.seq_len] # discard tailing
-    idx_all = 1:length(text)
-
-    for idx_batch in partition(idx_all, p.batch_size*p.seq_len)
-      for i = 1:p.seq_len
-        data_jl[i][:] .= 0
-        label_jl[i][:] .= 0
-      end
-
-      for (i, idx_seq) in enumerate(partition(idx_batch, p.seq_len))
-        for (j, idx) in enumerate(idx_seq)
-          c_this = text[idx]
-          c_next = idx == length(text) ? UNKNOWN_CHAR : text[idx+1]
-          data_jl[j][char_idx(vocab,c_this),i] = 1
-          label_jl[j][i] = char_idx(vocab,c_next)-1
-        end
-      end
-
-      for i = 1:p.seq_len
-        copy!(data_all[i], data_jl[i])
-        copy!(label_all[i], label_jl[i])
-      end
-
-      put!(c, batch)
-    end
-  end
-
-  return Channel(_text_iter)
-end
-#--/eachbatch-part2
-
-# helper function to convert a char into index in vocabulary
-function char_idx(vocab :: Dict{Char,Int}, c :: Char)
-  if haskey(vocab, c)
-    vocab[c]
-  else
-    vocab[UNKNOWN_CHAR]
-  end
-end
-
diff --git a/julia/examples/char-lstm/train.jl b/julia/examples/char-lstm/train.jl
deleted file mode 100644
index 7dbdefd42e41..000000000000
--- a/julia/examples/char-lstm/train.jl
+++ /dev/null
@@ -1,59 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-include(joinpath(@__DIR__, "config.jl"))
-include(joinpath(@__DIR__, "lstm.jl"))
-include(joinpath(@__DIR__, "seq-data.jl"))
-
-# build vocabulary
-vocab   = build_vocabulary(INPUT_FILE, VOCAB_FILE)
-n_class = length(vocab)
-
-#--LSTM
-# define LSTM
-lstm = LSTM(LSTM_N_LAYER, SEQ_LENGTH, DIM_HIDDEN, DIM_EMBED,
-            n_class, dropout=DROPOUT, name=NAME)
-#--/LSTM
-
-#--data
-# load data
-text_all  = read(INPUT_FILE, String)
-len_train = round(Int, length(text_all)*DATA_TR_RATIO)
-text_tr   = text_all[1:len_train]
-text_val  = text_all[len_train+1:end]
-
-data_tr   = CharSeqProvider(text_tr, BATCH_SIZE, SEQ_LENGTH, vocab, NAME,
-                            LSTM_N_LAYER, DIM_HIDDEN)
-data_val  = CharSeqProvider(text_val, BATCH_SIZE, SEQ_LENGTH, vocab, NAME,
-                            LSTM_N_LAYER, DIM_HIDDEN)
-#--/data
-
-# set up training
-if USE_GPU
-  context = [mx.gpu(i) for i = 0:N_GPU-1]
-else
-  context = mx.cpu()
-end
-
-#--train
-model = mx.FeedForward(lstm, context=context)
-optimizer = mx.ADAM(η=BASE_LR, λ=WEIGHT_DECAY, clip=CLIP_GRADIENT)
-
-mx.fit(model, optimizer, data_tr, eval_data=data_val, n_epoch=N_EPOCH,
-       initializer=mx.UniformInitializer(0.1),
-       callbacks=[mx.speedometer(), mx.do_checkpoint(CKPOINT_PREFIX)], eval_metric=NLL())
-#--/train
diff --git a/julia/examples/char-lstm/visualize.jl b/julia/examples/char-lstm/visualize.jl
deleted file mode 100644
index dd483940095e..000000000000
--- a/julia/examples/char-lstm/visualize.jl
+++ /dev/null
@@ -1,32 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-include(joinpath(@__DIR__, "config.jl"))
-include(joinpath(@__DIR__, "lstm.jl"))
-
-using MXNet
-
-vis_n_layer = 1
-vis_seq_len = 2
-vis_n_class = 128
-
-lstm  = LSTM(vis_n_layer, vis_seq_len, DIM_HIDDEN, DIM_EMBED, vis_n_class, name=NAME, output_states=true)
-
-open("visualize.dot", "w") do io
-  println(io, mx.to_graphviz(lstm))
-end
-run(pipeline(`dot -Tsvg visualize.dot`, stdout="visualize.svg"))
diff --git a/julia/examples/cifar10/cifar10.jl b/julia/examples/cifar10/cifar10.jl
deleted file mode 100644
index a00664ce3a50..000000000000
--- a/julia/examples/cifar10/cifar10.jl
+++ /dev/null
@@ -1,101 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-using MXNet
-
-#--------------------------------------------------------------------------------
-# Helper functions to construct larger networks
-
-# basic Conv + BN + ReLU factory
-function conv_factory(data, num_filter, kernel; stride=(1,1), pad=(0,0), act_type=:relu)
-  conv = mx.Convolution(data, num_filter=num_filter, kernel=kernel, stride=stride, pad=pad)
-  bn   = mx.BatchNorm(conv)
-  act  = mx.Activation(bn, act_type=act_type)
-  return act
-end
-
-# simple downsampling factory
-function downsample_factory(data, ch_3x3)
-  # conv 3x3
-  conv = conv_factory(data, ch_3x3, (3,3), stride=(2,2), pad=(1,1))
-  # pool
-  pool = mx.Pooling(data, kernel=(3,3), stride=(2,2), pool_type=:max)
-  # concat
-  concat = mx.Concat(conv, pool)
-  return concat
-end
-
-# a simple module
-function simple_factory(data, ch_1x1, ch_3x3)
-  # 1x1
-  conv1x1 = conv_factory(data, ch_1x1, (1,1); pad=(0,0))
-  # 3x3
-  conv3x3 = conv_factory(data, ch_3x3, (3,3); pad=(1,1))
-  # concat
-  concat = mx.Concat(conv1x1, conv3x3)
-  return concat
-end
-
-
-#--------------------------------------------------------------------------------
-# Actual architecture
-data    = mx.Variable(:data)
-conv1   = conv_factory(data, 96, (3,3); pad=(1,1), act_type=:relu)
-in3a    = simple_factory(conv1, 32, 32)
-in3b    = simple_factory(in3a, 32, 48)
-in3c    = downsample_factory(in3b, 80)
-in4a    = simple_factory(in3c, 112, 48)
-in4b    = simple_factory(in4a, 96, 64)
-in4c    = simple_factory(in4b, 80, 80)
-in4d    = simple_factory(in4b, 48, 96)
-in4e    = downsample_factory(in4d, 96)
-in5a    = simple_factory(in4e, 176, 160)
-in5b    = simple_factory(in5a, 176, 160)
-pool    = mx.Pooling(in5b, pool_type=:avg, kernel=(7,7), name=:global_pool)
-flatten = mx.Flatten(pool, name=:flatten1)
-fc      = mx.FullyConnected(flatten, num_hidden=10, name=:fc1)
-softmax = mx.SoftmaxOutput(fc, name=:loss)
-
-
-#--------------------------------------------------------------------------------
-# Prepare data
-filenames = mx.get_cifar10()
-batch_size = 128
-num_epoch  = 10
-num_gpus   = 8
-
-train_provider = mx.ImageRecordProvider(label_name=:loss_label,
-        path_imgrec=filenames[:train], mean_img=filenames[:mean],
-        rand_crop=true, rand_mirror=true, data_shape=(28,28,3),
-        batch_size=batch_size, preprocess_threads=1)
-test_provider = mx.ImageRecordProvider(label_name=:loss_label,
-        path_imgrec=filenames[:test], mean_img=filenames[:mean],
-        rand_crop=false, rand_mirror=false, data_shape=(28,28,3),
-        batch_size=batch_size, preprocess_threads=1)
-
-
-#--------------------------------------------------------------------------------
-# Training model
-gpus = [mx.Context(mx.GPU, i) for i = 0:num_gpus-1]
-model = mx.FeedForward(softmax, context=gpus)
-
-# optimizer
-optimizer = mx.SGD(η=0.05, μ=0.9, λ=0.0001)
-
-# fit parameters
-mx.fit(model, optimizer, train_provider, n_epoch=num_epoch, eval_data=test_provider,
-       initializer=mx.UniformInitializer(0.07), callbacks=[mx.speedometer()])
diff --git a/julia/examples/mnist/lenet-stn.jl b/julia/examples/mnist/lenet-stn.jl
deleted file mode 100644
index 95cd0955d402..000000000000
--- a/julia/examples/mnist/lenet-stn.jl
+++ /dev/null
@@ -1,81 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-using MXNet
-
-#--------------------------------------------------------------------------------
-# define lenet with stn layer
-
-
-
-# input
-data = mx.Variable(:data)
-
-
-# the localisation network in lenet-stn
-# it will increase acc about more than 1%, when num-epoch >=15
-# The localization net just takes the data as input and must output a vector in R^n
-loc_net = @mx.chain mx.Convolution(data, num_filter=10, kernel=(5, 5), stride=(2,2)) =>
-                    mx.Activation(act_type=:relu) =>
-                    mx.Pooling( kernel=(2, 2), stride=(2, 2), pool_type=:max) =>
-                    mx.Convolution( num_filter=10, kernel=(3, 3), stride=(2,2), pad=(1, 1)) =>
-                    mx.Activation(act_type=:relu) =>
-                    mx.Pooling( global_pool=true, kernel=(2, 2), pool_type=:avg) =>
-                    mx.Flatten() =>
-                    mx.FullyConnected(num_hidden=6, name=:stn_loc)
-
-data=mx.SpatialTransformer(data,loc_net, target_shape = (28,28), transform_type="affine", sampler_type="bilinear")
-
-# first conv
-conv1 = @mx.chain mx.Convolution(data, kernel=(5,5), num_filter=20)  =>
-                  mx.Activation(act_type=:tanh) =>
-                  mx.Pooling(pool_type=:max, kernel=(2,2), stride=(2,2))
-
-# second conv
-conv2 = @mx.chain mx.Convolution(conv1, kernel=(5,5), num_filter=50) =>
-                  mx.Activation(act_type=:tanh) =>
-                  mx.Pooling(pool_type=:max, kernel=(2,2), stride=(2,2))
-
-# first fully-connected
-fc1   = @mx.chain mx.Flatten(conv2) =>
-                  mx.FullyConnected(num_hidden=500) =>
-                  mx.Activation(act_type=:tanh)
-
-# second fully-connected
-fc2   = mx.FullyConnected(fc1, num_hidden=10)
-
-# softmax loss
-lenet = mx.SoftmaxOutput(fc2, name=:softmax)
-
-
-#--------------------------------------------------------------------------------
-
-# load data
-batch_size = 100
-include("mnist-data.jl")
-train_provider, eval_provider = get_mnist_providers(batch_size; flat=false)
-
-#--------------------------------------------------------------------------------
-# fit model
-model = mx.FeedForward(lenet, context=mx.cpu())
-
-# optimizer
-optimizer = mx.ADAM(η=0.01, λ=0.00001)
-
-# fit parameters
-initializer=mx.XavierInitializer(distribution = mx.xv_uniform, regularization = mx.xv_avg, magnitude = 1)
-mx.fit(model, optimizer, train_provider, n_epoch=20, eval_data=eval_provider,initializer=initializer)
diff --git a/julia/examples/mnist/lenet.jl b/julia/examples/mnist/lenet.jl
deleted file mode 100644
index 5ee15d69dd1b..000000000000
--- a/julia/examples/mnist/lenet.jl
+++ /dev/null
@@ -1,62 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-using MXNet
-
-#--------------------------------------------------------------------------------
-# define lenet
-
-# input
-data = mx.Variable(:data)
-
-# first conv
-conv1 = @mx.chain mx.Convolution(data, kernel=(5,5), num_filter=20)  =>
-                  mx.Activation(act_type=:tanh) =>
-                  mx.Pooling(pool_type=:max, kernel=(2,2), stride=(2,2))
-
-# second conv
-conv2 = @mx.chain mx.Convolution(conv1, kernel=(5,5), num_filter=50) =>
-                  mx.Activation(act_type=:tanh) =>
-                  mx.Pooling(pool_type=:max, kernel=(2,2), stride=(2,2))
-
-# first fully-connected
-fc1   = @mx.chain mx.Flatten(conv2) =>
-                  mx.FullyConnected(num_hidden=500) =>
-                  mx.Activation(act_type=:tanh)
-
-# second fully-connected
-fc2   = mx.FullyConnected(fc1, num_hidden=10)
-
-# softmax loss
-lenet = mx.SoftmaxOutput(fc2, name=:softmax)
-
-
-#--------------------------------------------------------------------------------
-# load data
-batch_size = 100
-include("mnist-data.jl")
-train_provider, eval_provider = get_mnist_providers(batch_size; flat=false)
-
-#--------------------------------------------------------------------------------
-# fit model
-model = mx.FeedForward(lenet, context=mx.gpu())
-
-# optimizer
-optimizer = mx.SGD(η=0.05, μ=0.9, λ=0.00001)
-
-# fit parameters
-mx.fit(model, optimizer, train_provider, n_epoch=20, eval_data=eval_provider)
diff --git a/julia/examples/mnist/mlp-test.jl b/julia/examples/mnist/mlp-test.jl
deleted file mode 100644
index 7a24f9281652..000000000000
--- a/julia/examples/mnist/mlp-test.jl
+++ /dev/null
@@ -1,122 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# This file is primarily to be included from runtest.jl. We tried to cover various
-# features of MXNet.jl in this example in order to detect regression errors.
-
-module MNISTTest
-
-using MXNet
-using Test
-
-include("mnist-data.jl")
-
-function get_mnist_mlp()
-  @mx.chain mx.Variable(:data)                   =>
-    mx.FullyConnected(name=:fc1, num_hidden=128) =>
-    mx.Activation(name=:relu1, act_type=:relu)   =>
-    mx.FullyConnected(name=:fc2, num_hidden=64)  =>
-    mx.Activation(name=:relu2, act_type=:relu)   =>
-    mx.FullyConnected(name=:fc3, num_hidden=10)  =>
-    mx.SoftmaxOutput(name=:softmax)
-end
-
-get_mnist_data(batch_size = 100) = get_mnist_providers(batch_size)
-
-function mnist_fit_and_predict(optimizer, initializer, n_epoch)
-  mlp = get_mnist_mlp()
-  train_provider, eval_provider = get_mnist_data()
-
-  # setup model
-  model = mx.FeedForward(mlp, context = mx.cpu())
-
-  # fit parameters
-  cp_prefix = "mnist-test-cp"
-  mx.fit(model, optimizer, train_provider, eval_data=eval_provider, n_epoch=n_epoch,
-         initializer=initializer, callbacks=[mx.speedometer(), mx.do_checkpoint(cp_prefix, save_epoch_0=true)])
-
-  # make sure the checkpoints are saved
-  @test isfile("$cp_prefix-symbol.json")
-  for i_epoch = 0:n_epoch
-    @test isfile(mx.format("{1}-{2:04d}.params", cp_prefix, i_epoch))
-  end
-  mlp_load = mx.load("$cp_prefix-symbol.json", mx.SymbolicNode)
-  @test mx.to_json(mlp_load) == mx.to_json(mlp)
-  mlp_load = mx.from_json(read("$cp_prefix-symbol.json", String), mx.SymbolicNode)
-  @test mx.to_json(mlp_load) == mx.to_json(mlp)
-
-  #--------------------------------------------------------------------------------
-  # the predict API
-  probs = mx.predict(model, eval_provider)
-
-  # collect all labels from eval data
-  labels = Array[]
-  for batch in eval_provider
-    push!(labels, copy(mx.get(eval_provider, batch, :softmax_label)))
-  end
-  labels = cat(labels..., dims = 1)
-
-  # Now we use compute the accuracy
-  correct = 0
-  for i = 1:length(labels)
-    # labels are 0...9
-    if argmax(probs[:,i]) == labels[i]+1
-      correct += 1
-    end
-  end
-  accuracy = 100correct/length(labels)
-  println(mx.format("Accuracy on eval set: {1:.2f}%", accuracy))
-
-  # try to call visualization
-  dot_code = mx.to_graphviz(mlp)
-
-  return accuracy
-end
-
-function test_mnist_mlp()
-  @info("MNIST::SGD")
-  @test mnist_fit_and_predict(mx.SGD(η=.2), mx.UniformInitializer(.01), 2) > 90
-
-  @info("MNIST::SGD::η scheduler")
-  @test mnist_fit_and_predict(mx.SGD(η_sched=mx.LearningRate.Inv(.25)),
-                              mx.UniformInitializer(.01), 2) > 90
-
-  @info("MNIST::SGD::momentum μ")
-  @test mnist_fit_and_predict(mx.SGD(η=.1, μ=.9), mx.UniformInitializer(.01), 2) > 90
-
-  @info("MNIST::ADAM")
-  @test mnist_fit_and_predict(mx.ADAM(), mx.NormalInitializer(), 2) > 90
-
-  @info("MNIST::AdaGrad")
-  @test mnist_fit_and_predict(mx.AdaGrad(), mx.NormalInitializer(), 2) > 90
-
-  @info("MNIST::AdaDelta")
-  @test mnist_fit_and_predict(mx.AdaDelta(), mx.NormalInitializer(), 2) > 90
-
-  @info("MNIST::AdaMax")
-  @test mnist_fit_and_predict(mx.AdaMax(), mx.NormalInitializer(), 2) > 90
-
-  @info("MNIST::RMSProp")
-  @test mnist_fit_and_predict(mx.RMSProp(), mx.NormalInitializer(), 2) > 90
-
-  @info("MNIST::Nadam")
-  @test mnist_fit_and_predict(mx.Nadam(), mx.NormalInitializer(), 2) > 90
-end
-
-test_mnist_mlp()
-
-end # module MNISTTest
diff --git a/julia/examples/mnist/mlp.jl b/julia/examples/mnist/mlp.jl
deleted file mode 100644
index 86111ee68172..000000000000
--- a/julia/examples/mnist/mlp.jl
+++ /dev/null
@@ -1,75 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-using MXNet
-
-#--------------------------------------------------------------------------------
-# define MLP
-# the following two ways are equivalent
-
-#-- Option 1: explicit composition
-# data = mx.Variable(:data)
-# fc1  = mx.FullyConnected(data, name=:fc1, num_hidden=128)
-# act1 = mx.Activation(fc1, name=:relu1, act_type=:relu)
-# fc2  = mx.FullyConnected(act1, name=:fc2, num_hidden=64)
-# act2 = mx.Activation(fc2, name=:relu2, act_type=:relu)
-# fc3  = mx.FullyConnected(act2, name=:fc3, num_hidden=10)
-# mlp  = mx.SoftmaxOutput(fc3, name=:softmax)
-
-#-- Option 2: using the mx.chain macro
-# mlp = @mx.chain mx.Variable(:data)             =>
-#   mx.FullyConnected(name=:fc1, num_hidden=128) =>
-#   mx.Activation(name=:relu1, act_type=:relu)   =>
-#   mx.FullyConnected(name=:fc2, num_hidden=64)  =>
-#   mx.Activation(name=:relu2, act_type=:relu)   =>
-#   mx.FullyConnected(name=:fc3, num_hidden=10)  =>
-#   mx.SoftmaxOutput(name=:softmax)
-
-#-- Option 3: using nn-factory
-mlp = @mx.chain mx.Variable(:data) =>
-  mx.MLP([128, 64, 10])            =>
-  mx.SoftmaxOutput(name=:softmax)
-
-# data provider
-batch_size = 100
-include("mnist-data.jl")
-train_provider, eval_provider = get_mnist_providers(batch_size)
-
-# setup model
-model = mx.FeedForward(mlp, context=mx.cpu())
-
-# optimizer
-optimizer = mx.SGD(η=0.1, μ=0.9, λ=0.00001)
-
-# fit parameters
-mx.fit(model, optimizer, train_provider, eval_data=eval_provider, n_epoch=20)
-
-#--------------------------------------------------------------------------------
-# Optional, demonstration of the predict API
-probs = mx.predict(model, eval_provider)
-
-# collect all labels from eval data
-labels = reduce(
-  vcat,
-  copy(mx.get(eval_provider, batch, :softmax_label)) for batch ∈ eval_provider)
-# labels are 0...9
-labels .= labels .+ 1
-
-# Now we use compute the accuracy
-pred = map(i -> argmax(probs[1:10, i]), 1:size(probs, 2))
-correct = sum(pred .== labels)
-@printf "Accuracy on eval set: %.2f%%\n" 100correct/length(labels)
diff --git a/julia/examples/mnist/mnist-data.jl b/julia/examples/mnist/mnist-data.jl
deleted file mode 100644
index 12160cf6f18e..000000000000
--- a/julia/examples/mnist/mnist-data.jl
+++ /dev/null
@@ -1,33 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-function get_mnist_providers(batch_size::Int; data_name=:data, label_name=:softmax_label, flat=true)
-  # download MNIST into Pkg.dir("MXNet")/data/mnist if not exist
-  filenames = mx.get_mnist_ubyte()
-
-  # data provider
-  train_provider = mx.MNISTProvider(image=filenames[:train_data],
-                                    label=filenames[:train_label],
-                                    data_name=data_name, label_name=label_name,
-                                    batch_size=batch_size, shuffle=true, flat=flat, silent=true)
-  eval_provider = mx.MNISTProvider(image=filenames[:test_data],
-                                   label=filenames[:test_label],
-                                   data_name=data_name, label_name=label_name,
-                                   batch_size=batch_size, shuffle=false, flat=flat, silent=true)
-
-  return (train_provider, eval_provider)
-end
diff --git a/julia/examples/nondefault-example.jl b/julia/examples/nondefault-example.jl
deleted file mode 100644
index 75eff085a459..000000000000
--- a/julia/examples/nondefault-example.jl
+++ /dev/null
@@ -1,147 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-#=
-    Contents: This file contains code for:
-              - Setting the initial values of the biases and weights equal to the final values of a previous run.
-	        This is helpful for re-estimating a model on updated training data, where the original and updated training data largely overlap.
-	      - Changing the loss function (in our example from Accuracy to ACE)
-
-    Notes:
-    1. The model is a toy example with 4 outcomes (categories).
-       The model is a poor fit to the data, but this is unimportant. The point of the example is to demonstrate the use of some non-default settings.
-    2. For categorical outcomes, use 0-based categories! Some of the loss functions assume this, such as ACE.
-    3. Incomplete batches are padded with repeated instances of an artificial observation.
-       This is bad because the artificial data is over-represented and thus biases the results.
-       The ideal solution is to distribute the observations from the incomplete batch among the complete batches.
-       This would result in batches of variable but similar size, and thus the estimate of the gradient would not be significantly affected.
-       But this doesn't happen.
-       For simplicity we instead drop these extra observations, so that the number of observations in the data set is a multiple of the batch_size.
-=#
-
-
-using RDatasets
-using MXNet
-
-
-################################################################################
-### Data: Exam scores discretised into 4 categories (use zero-based categories!).
-df = dataset("mlmRev", "Gcsemv");    # 1905 x 5
-complete_cases!(df)                  # 1523 x 5
-n = nrow(df)
-df[:written] = zeros(Int, n)
-df[:course]  = zeros(Int, n)
-for i = 1:n
-    # Categorise :Written
-    if df[i, :Written] <= 20.0
-	df[i, :written] = 0
-    elseif df[i, :Written] <= 40.0
-	df[i, :written] = 1
-    elseif df[i, :Written] <= 60.0
-	df[i, :written] = 2
-    else
-	df[i, :written] = 3
-    end
-
-    # Categorise :Course
-    if df[i, :Course] <= 25.0
-	df[i, :course] = 0
-    elseif df[i, :Course] <= 50.0
-	df[i, :course] = 1
-    elseif df[i, :Course] <= 75.0
-	df[i, :course] = 2
-    else
-	df[i, :course] = 3
-    end
-end
-df = df[1:1500, :]    # Ensure nrows is a multiple of batch_size (100 in our example, see below)
-
-x = convert(Vector{Float64}, df[:course])
-y = convert(Vector{Float64}, df[:written])
-
-
-################################################################################
-### Hyperparameters
-
-# Architecture
-mlp = @mx.chain mx.Variable(:data) =>
-        mx.FullyConnected(name = :h1, num_hidden = 10) =>
-	mx.Activation(name = :h1_out, act_type = :sigmoid) =>
-        mx.FullyConnected(name = :out, num_hidden = 4) =>
-	mx.SoftmaxOutput(name = :softmax)
-
-# Hyperparameters
-n_epoch    = 100
-batch_size = 100
-learn_rate = 0.1
-mom        = 0.9
-wt_decay   = 0.00001
-
-
-# Connect data, network architecture and hyperparameters
-train_prov = mx.ArrayDataProvider(x, y; batch_size = batch_size)
-eval_prov  = mx.ArrayDataProvider(x, y; batch_size = batch_size)
-opt        = mx.SGD(lr = learn_rate, momentum = mom, weight_decay = wt_decay)    # Optimizing algorithm
-
-################################################################################
-### Run 1: Basic run, storing initial and final state.
-
-# Learn
-mdl1 = mx.FeedForward(mlp, context = mx.cpu())                                               # Model targets the local CPU
-cb = mx.do_checkpoint("first", frequency = n_epoch, save_epoch_0 = true)                     # Write initial and final states to disk
-mx.fit(mdl1, opt, train_prov, n_epoch = n_epoch, eval_data = eval_prov, callbacks = [cb])    # Random initial biases and weights
-
-
-################################################################################
-### Run 2: Load the previously trained model and run it some more, starting where Run 1 finished.
-
-# Load final state of 1st run from disk
-arch, arg_params, aux_params = mx.load_checkpoint("first", 100)    # arch is the network structure, arg_params contains the weights and biases
-mdl2 = mx.FeedForward(arch, context = mx.cpu())                    # Only populates the arch and ctx fields
-mdl2.arg_params = arg_params                                       # Populate the arg_params fields
-cb   = mx.do_checkpoint("second", frequency = n_epoch, save_epoch_0 = true)
-mx.fit(mdl2, opt, train_prov, n_epoch = n_epoch, eval_data = eval_prov, callbacks = [cb])
-
-# Test whether the final state of 1st run equals the initial state of 2nd run
-run(`diff first-0100.params second-0000.params`)    # Throws error if not true, does nothing otherwise
-
-
-#=
-    # Other useful functions
-    arch       = mx.load("first-symbol.json", mx.SymbolicNode)
-    arg_params = mx.load("first-0100.params", mx.NDArray)
-=#
-
-
-################################################################################
-### Run 3: Change the loss function from the default Accuracy to ACE
-
-mdl3 = mx.FeedForward(mlp, context = mx.cpu())
-mx.fit(mdl3, opt, train_prov, n_epoch = n_epoch, eval_data = eval_prov, eval_metric = mx.ACE())
-#mx.fit(mdl3, opt, train_prov, n_epoch = n_epoch, eval_data = eval_prov, eval_metric = mx.Accuracy())    # Default eval_metric
-#mx.fit(mdl3, opt, train_prov, n_epoch = n_epoch, eval_data = eval_prov, eval_metric = mx.MultiACE(4))
-
-# Test manually
-probs = mx.predict(mdl3, eval_prov)
-LL    = 0.0
-for i = 1:size(y, 1)
-    LL += log(probs[Int(y[i]) + 1, i])
-end
--LL / size(y, 1)    # Should equal the value of ACE from the final iteration of fit(mdl3, ...)
-
-
-# EOF
diff --git a/julia/examples/regression-example.jl b/julia/examples/regression-example.jl
deleted file mode 100644
index bbbb415fe664..000000000000
--- a/julia/examples/regression-example.jl
+++ /dev/null
@@ -1,101 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-#=
-This script shows how a simple MLP net may be used
-for regression. It shows how data in memory may be
-used for training and evaluation, and how to obtain
-the predictions from the trained net.
-=#
-using MXNet
-using Distributions
-#using Plots
-
-# data generating process
-generate_inputs(mean, var, size) = rand(MvNormal(mean, var), size)
-output(data) = sin.(data[1:1,:]).*sin.(data[2:2,:])./(data[1:1,:].*data[2:2,:])
-
-# create training and evaluation data sets
-mean=[0.0; 0.0]
-var=[1.0 0.0; 0.0 1.0]
-samplesize  = 5000
-TrainInput = generate_inputs(mean, var, samplesize)
-TrainOutput = output(TrainInput)
-ValidationInput = generate_inputs(mean, var, samplesize)
-ValidationOutput = output(ValidationInput)
-
-# how to set up data providers using data in memory
-function data_source(batchsize = 100)
-  train = mx.ArrayDataProvider(
-    :data => TrainInput,
-    :label => TrainOutput,
-    batch_size = batchsize,
-    shuffle = true,
-    )
-  valid = mx.ArrayDataProvider(
-    :data => ValidationInput,
-    :label => ValidationOutput,
-    batch_size = batchsize,
-    shuffle = true,
-    )
-
-  train, valid
-end
-
-# create a two hidden layer MPL: try varying num_hidden, and change tanh to relu,
-# or add/remove a layer
-data = mx.Variable(:data)
-label = mx.Variable(:label)
-net = @mx.chain     mx.Variable(:data) =>
-                    mx.FullyConnected(num_hidden=10) =>
-                    mx.Activation(act_type=:tanh) =>
-                    mx.FullyConnected(num_hidden=3) =>
-                    mx.Activation(act_type=:tanh) =>
-                    mx.FullyConnected(num_hidden=1) =>
-                    mx.LinearRegressionOutput(mx.Variable(:label))
-
-# final model definition, don't change, except if using gpu
-model = mx.FeedForward(net, context=mx.cpu())
-
-# set up the optimizer: select one, explore parameters, if desired
-#optimizer = mx.SGD(η=0.01, μ=0.9, λ=0.00001)
-optimizer = mx.ADAM()
-
-# train, reporting loss for training and evaluation sets
-# initial training with small batch size, to get to a good neighborhood
-trainprovider, evalprovider = data_source(#= batchsize =# 200)
-mx.fit(model, optimizer, trainprovider,
-       initializer = mx.NormalInitializer(0.0, 0.1),
-       eval_metric = mx.MSE(),
-       eval_data = evalprovider,
-       n_epoch = 20,
-       callbacks = [mx.speedometer()])
-# more training with the full sample
-trainprovider, evalprovider = data_source(#= batchsize =# samplesize)
-mx.fit(model, optimizer, trainprovider,
-       initializer = mx.NormalInitializer(0.0, 0.1),
-       eval_metric = mx.MSE(),
-       eval_data = evalprovider,
-       n_epoch = 500,  # previous setting is batchsize = 200, epoch = 20
-                       # implies we did (5000 / 200) * 20 times update in previous `fit`
-       callbacks = [mx.speedometer()])
-
-# obtain predictions
-plotprovider = mx.ArrayDataProvider(:data => ValidationInput, :label => ValidationOutput)
-fit = mx.predict(model, plotprovider)
-println("correlation between fitted values and true regression line: ", cor(vec(fit), vec(ValidationOutput)))
-#scatter(ValidationOutput',fit',w = 3, xlabel="true", ylabel="predicted", title="45º line is what we hope for", show=true)
diff --git a/julia/test/runtests.jl b/julia/test/runtests.jl
index e75df67ad8a4..e30b68ac3e6f 100644
--- a/julia/test/runtests.jl
+++ b/julia/test/runtests.jl
@@ -37,11 +37,4 @@ const BASEDIR = joinpath(@__DIR__, "..")
 include(joinpath(@__DIR__, "common.jl"))
 @testset "MXNet Test" begin
   test_dir(joinpath(@__DIR__, "unittest"))
-
-  # run the basic MNIST mlp example
-  if haskey(ENV, "INTEGRATION_TEST")
-    @testset "MNIST Test" begin
-      include(joinpath(BASEDIR, "examples", "mnist", "mlp-test.jl"))
-    end
-  end
 end
diff --git a/julia/test/unittest/symbolic-node.jl b/julia/test/unittest/symbolic-node.jl
index 07ef05f704db..69c852f6f843 100644
--- a/julia/test/unittest/symbolic-node.jl
+++ b/julia/test/unittest/symbolic-node.jl
@@ -34,28 +34,6 @@ function test_basic()
   @test mx.list_auxiliary_states(model) == Symbol[]
 end
 
-function test_chain()
-  @info("SymbolicNode::chain")
-
-  model = mlpchain()
-  @test mx.list_arguments(model) == [:data,:fc1_weight,:fc1_bias,:fc2_weight,:fc2_bias]
-  @test mx.list_outputs(model) == [:fc2_output]
-  @test mx.list_auxiliary_states(model) == Symbol[]
-
-  let layerconfig = [20, 10, 6]
-    model = @mx.chain mx.Variable(:data) =>
-      mx.MLP(layerconfig, prefix=:magic_) =>
-      mx.LinearRegressionOutput(mx.Variable(:label))
-
-    @test mx.list_arguments(model) == [
-      :data,
-      :magic_fc1_weight, :magic_fc1_bias,
-      :magic_fc2_weight, :magic_fc2_bias,
-      :magic_fc3_weight, :magic_fc3_bias,
-      :label]
-  end
-end
-
 function test_internal()
   @info("SymbolicNode::internal")
 
@@ -541,7 +519,6 @@ end  # test_var
 ################################################################################
 @testset "SymbolicNode Test" begin
   test_basic()
-  test_chain()
   test_internal()
   test_compose()
   test_infer_shape()
diff --git a/perl-package/AI-MXNet/MANIFEST b/perl-package/AI-MXNet/MANIFEST
index 763d7186d0c9..fef158689f2a 100644
--- a/perl-package/AI-MXNet/MANIFEST
+++ b/perl-package/AI-MXNet/MANIFEST
@@ -1,8 +1,4 @@
 Changes
-examples/calculator.pl
-examples/char_lstm.pl
-examples/cudnn_lstm_bucketing.pl
-examples/get_sherlockholmes_data.sh
 examples/gluon/dcgan.pl
 examples/gluon/mnist.pl
 examples/gluon/style_transfer/get_data.sh
@@ -10,15 +6,6 @@ examples/gluon/style_transfer/net.pl
 examples/gluon/style_transfer/README.md
 examples/gluon/style_transfer/style_transfer.pl
 examples/gluon/style_transfer/utils.pl
-examples/lstm_bucketing.pl
-examples/mnist.pl
-examples/plot_network.pl
-examples/sparse/matrix_factorization/get_data.sh
-examples/sparse/matrix_factorization/README.md
-examples/sparse/matrix_factorization/train.pl
-examples/sparse/wide_deep/get_data.sh
-examples/sparse/wide_deep/README.md
-examples/sparse/wide_deep/train.pl
 lib/AI/MXNet.pm
 lib/AI/MXNet/AutoGrad.pm
 lib/AI/MXNet/AutoLoad.pm
@@ -104,7 +91,6 @@ t/AI-MXNet.t
 t/test_attr.t
 t/test_autograd.t
 t/test_base.t
-t/test_conv.t
 t/test_cuda_module.t
 t/test_engine.t
 t/test_executor.t
@@ -121,7 +107,6 @@ t/test_loss.t
 t/test_metric.t
 t/test_model_parallel.t
 t/test_module.t
-t/test_multi_device_exec.t
 t/test_ndarray.t
 t/test_optimizers.t
 t/test_random.t
diff --git a/perl-package/AI-MXNet/examples/calculator.pl b/perl-package/AI-MXNet/examples/calculator.pl
deleted file mode 100755
index 0350536c730f..000000000000
--- a/perl-package/AI-MXNet/examples/calculator.pl
+++ /dev/null
@@ -1,156 +0,0 @@
-#!/usr/bin/env perl
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-use AI::MXNet ('mx');
-
-## preparing the samples
-## to train our network
-sub samples {
-    my($batch_size, $func) = @_;
-    # get samples
-    my $n = 16384;
-    ## creates a pdl with $n rows and two columns with random
-    ## floats in the range between 0 and 1
-    my $data = PDL->random(2, $n);
-    ## creates the pdl with $n rows and one column with labels
-    ## labels are floats that either sum or product, etc of
-    ## two random values in each corresponding row of the data pdl
-    my $label = $func->($data->slice('0,:'), $data->slice('1,:'));
-    # partition into train/eval sets
-    my $edge = int($n / 8);
-    my $validation_data = $data->slice(":,0:@{[ $edge - 1 ]}");
-    my $validation_label = $label->slice(":,0:@{[ $edge - 1 ]}");
-    my $train_data = $data->slice(":,$edge:");
-    my $train_label = $label->slice(":,$edge:");
-    # build iterators around the sets
-    return(mx->io->NDArrayIter(
-        batch_size => $batch_size,
-        data => $train_data,
-        label => $train_label,
-    ), mx->io->NDArrayIter(
-        batch_size => $batch_size,
-        data => $validation_data,
-        label => $validation_label,
-    ));
-}
-
-## the network model
-sub nn_fc {
-    my $data = mx->sym->Variable('data');
-    my $ln = mx->sym->exp(mx->sym->FullyConnected(
-        data => mx->sym->log($data),
-        num_hidden => 1,
-    ));
-    my $wide = mx->sym->Concat($data, $ln);
-    my $fc = mx->sym->FullyConnected(
-	$wide,
-	num_hidden => 1
-    );
-    return mx->sym->MAERegressionOutput(data => $fc, name => 'softmax');
-}
-
-sub learn_function {
-    my(%args) = @_;
-    my $func = $args{func};
-    my $batch_size = $args{batch_size}//128;
-    my($train_iter, $eval_iter) = samples($batch_size, $func);
-    my $sym = nn_fc();
-
-    ## call as ./calculator.pl 1 to just print model and exit
-    if($ARGV[0]) {
-        my @dsz = @{$train_iter->data->[0][1]->shape};
-        my @lsz = @{$train_iter->label->[0][1]->shape};
-        my $shape = {
-            data          => [ $batch_size, splice @dsz,  1 ],
-            softmax_label => [ $batch_size, splice @lsz, 1 ],
-        };
-        print mx->viz->plot_network($sym, shape => $shape)->graph->as_png;
-        exit;
-    }
-
-    my $model = mx->mod->Module(
-        symbol => $sym,
-        context => mx->cpu(),
-    );
-    $model->fit($train_iter,
-        eval_data => $eval_iter,
-        optimizer => 'adam',
-        optimizer_params => {
-            learning_rate => $args{lr}//0.01,
-            rescale_grad => 1/$batch_size,
-            lr_scheduler  => AI::MXNet::FactorScheduler->new(
-        	step => 100,
-        	factor => 0.99
-            )
-        },
-        eval_metric => 'mse',
-        num_epoch => $args{epoch}//25,
-    );
-
-    # refit the model for calling on 1 sample at a time
-    my $iter = mx->io->NDArrayIter(
-        batch_size => 1,
-        data => PDL->pdl([[ 0, 0 ]]),
-        label => PDL->pdl([[ 0 ]]),
-    );
-    $model->reshape(
-        data_shapes => $iter->provide_data,
-        label_shapes => $iter->provide_label,
-    );
-
-    # wrap a helper around making predictions
-    my ($arg_params) = $model->get_params;
-    for my $k (sort keys %$arg_params)
-    {
-	print "$k -> ". $arg_params->{$k}->aspdl."\n";
-    }
-    return sub {
-        my($n, $m) = @_;
-        return $model->predict(mx->io->NDArrayIter(
-            batch_size => 1,
-            data => PDL->new([[ $n, $m ]]),
-        ))->aspdl->list;
-    };
-}
-
-my $add = learn_function(func => sub {
-    my($n, $m) = @_;
-    return $n + $m;
-});
-my $sub = learn_function(func => sub {
-    my($n, $m) = @_;
-    return $n - $m;
-}, batch_size => 50, epoch => 40);
-my $mul = learn_function(func => sub {
-    my($n, $m) = @_;
-    return $n * $m;
-}, batch_size => 50, epoch => 40);
-my $div = learn_function(func => sub {
-    my($n, $m) = @_;
-    return $n / $m;
-}, batch_size => 10, epoch => 80);
-
-
-print "12345 + 54321 ≈ ", $add->(12345, 54321), "\n";
-print "188 - 88 ≈ ", $sub->(188, 88), "\n";
-print "250 * 2 ≈ ", $mul->(250, 2), "\n";
-print "250 / 2 ≈ ", $div->(250, 2), "\n";
-
diff --git a/perl-package/AI-MXNet/examples/char_lstm.pl b/perl-package/AI-MXNet/examples/char_lstm.pl
deleted file mode 100755
index a8bf72599797..000000000000
--- a/perl-package/AI-MXNet/examples/char_lstm.pl
+++ /dev/null
@@ -1,255 +0,0 @@
-#!/usr/bin/env perl
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-use PDL;
-use Math::Random::Discrete;
-use AI::MXNet qw(mx);
-use AI::MXNet::Function::Parameters;
-use Getopt::Long qw(HelpMessage);
-
-GetOptions(
-    'num-layers=i'   => \(my $num_layers   = 2       ),
-    'num-hidden=i'   => \(my $num_hidden   = 256     ),
-    'num-embed=i'    => \(my $num_embed    = 10      ),
-    'num-seq=i'      => \(my $seq_size     = 60      ),
-    'gpus=s'         => \(my $gpus                   ),
-    'kv-store=s'     => \(my $kv_store     = 'device'),
-    'num-epoch=i'    => \(my $num_epoch    = 25      ),
-    'lr=f'           => \(my $lr           = 0.001    ),
-    'optimizer=s'    => \(my $optimizer    = 'adam'   ),
-    'mom=f'          => \(my $mom          = 0       ),
-    'wd=f'           => \(my $wd           = 0.00001 ),
-    'batch-size=i'   => \(my $batch_size   = 32      ),
-    'disp-batches=i' => \(my $disp_batches = 50      ),
-    'chkp-prefix=s'  => \(my $chkp_prefix  = 'lstm_' ),
-    'cell-mode=s'    => \(my $cell_mode    = 'LSTM'  ),
-    'sample-size=i'  => \(my $sample_size  = 10000   ),
-    'chkp-epoch=i'   => \(my $chkp_epoch   = 1       ),
-    'bidirectional=i'=> \(my $bidirectional= 0       ),
-    'help'           => sub { HelpMessage(0) },
-) or HelpMessage(1);
-
-=head1 NAME
-
-    char_lstm.pl - Example of training char LSTM RNN on tiny shakespeare using high level RNN interface
-                   with optional inferred sampling (RNN generates Shakespeare like text)
-
-=head1 SYNOPSIS
-
-    --num-layers     number of stacked RNN layers, default=2
-    --num-hidden     hidden layer size, default=256
-    --num-embed      embed size, default=10
-    --num-seq        sequence size, default=60
-    --gpus           list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu.
-                     Increase batch size when using multiple gpus for best performance.
-    --kv-store       key-value store type, default='device'
-    --num-epochs     max num of epochs, default=25
-    --lr             initial learning rate, default=0.01
-    --optimizer      the optimizer type, default='adam'
-    --mom            momentum for sgd, default=0.0
-    --wd             weight decay for sgd, default=0.00001
-    --batch-size     the batch size type, default=32
-    --bidirectional  use bidirectional cell, default false (0)
-    --disp-batches   show progress for every n batches, default=50
-    --chkp-prefix    prefix for checkpoint files, default='lstm_'
-    --cell-mode      RNN cell mode (LSTM, GRU, RNN, default=LSTM)
-    --sample-size    a size of inferred sample text (default=10000) after each epoch
-    --chkp-epoch     save checkpoint after this many epoch, default=1 (saving every checkpoint)
-
-=cut
-
-package AI::MXNet::RNN::IO::ASCIIIterator;
-use Mouse;
-extends AI::MXNet::DataIter;
-has 'data'          => (is => 'ro',  isa => 'PDL',   required => 1);
-has 'seq_size'      => (is => 'ro',  isa => 'Int',   required => 1);
-has '+batch_size'   => (is => 'ro',  isa => 'Int',   required => 1);
-has 'data_name'     => (is => 'ro',  isa => 'Str',   default => 'data');
-has 'label_name'    => (is => 'ro',  isa => 'Str',   default => 'softmax_label');
-has 'dtype'         => (is => 'ro',  isa => 'Dtype', default => 'float32');
-has [qw/nd counter seq_counter vocab_size
-    data_size provide_data provide_label idx/] => (is => 'rw', init_arg => undef);
-
-sub BUILD
-{
-    my $self = shift;
-    $self->data_size($self->data->nelem);
-    my $segments = int(($self->data_size-$self->seq_size)/($self->batch_size*$self->seq_size));
-    $self->idx([0..$segments-1]);
-    $self->vocab_size($self->data->uniq->shape->at(0));
-    $self->counter(0);
-    $self->seq_counter(0);
-    $self->nd(mx->nd->array($self->data, dtype => $self->dtype));
-    my $shape = [$self->batch_size, $self->seq_size];
-    $self->provide_data([
-        AI::MXNet::DataDesc->new(
-            name  => $self->data_name,
-            shape => $shape,
-            dtype => $self->dtype
-        )
-    ]);
-    $self->provide_label([
-        AI::MXNet::DataDesc->new(
-            name  => $self->label_name,
-            shape => $shape,
-            dtype => $self->dtype
-        )
-    ]);
-    $self->reset;
-}
-
-method reset()
-{
-    $self->counter(0);
-    @{ $self->idx } = List::Util::shuffle(@{ $self->idx });
-}
-
-method next()
-{
-    return undef if $self->counter == @{$self->idx};
-    my $offset = $self->idx->[$self->counter]*$self->batch_size*$self->seq_size + $self->seq_counter;
-    my $data = $self->nd->slice(
-        [$offset, $offset + $self->batch_size*$self->seq_size-1]
-    )->reshape([$self->batch_size, $self->seq_size]);
-    my $label = $self->nd->slice(
-        [$offset + 1 , $offset + $self->batch_size*$self->seq_size]
-    )->reshape([$self->batch_size, $self->seq_size]);
-    $self->seq_counter($self->seq_counter + 1);
-    if($self->seq_counter == $self->seq_size - 1)
-    {
-        $self->counter($self->counter + 1);
-        $self->seq_counter(0);
-    }
-    return AI::MXNet::DataBatch->new(
-        data          => [$data],
-        label         => [$label],
-        provide_data  => [
-            AI::MXNet::DataDesc->new(
-                name  => $self->data_name,
-                shape => $data->shape,
-                dtype => $self->dtype
-            )
-        ],
-        provide_label => [
-            AI::MXNet::DataDesc->new(
-                name  => $self->label_name,
-                shape => $label->shape,
-                dtype => $self->dtype
-            )
-        ],
-    );
-}
-
-package main;
-my $file = "data/input.txt";
-open(F, $file) or die "can't open $file: $!";
-my $fdata;
-{ local($/) = undef; $fdata = <F>; close(F) };
-my %vocabulary; my $i = 0;
-$fdata = pdl(map{ exists $vocabulary{$_} ? $vocabulary{$_} : ($vocabulary{$_} = $i++) } split(//, $fdata));
-my $data_iter = AI::MXNet::RNN::IO::ASCIIIterator->new(
-    batch_size => $batch_size,
-    data       => $fdata,
-    seq_size   => $seq_size
-);
-my %reverse_vocab = reverse %vocabulary;
-my $mode = "${cell_mode}Cell";
-my $stack = mx->rnn->SequentialRNNCell();
-for my $i (0..$num_layers-1)
-{
-    my $cell = mx->rnn->$mode(num_hidden => $num_hidden, prefix => "lstm_${i}l0_");
-    if($bidirectional)
-    {
-        $cell = mx->rnn->BidirectionalCell(
-            $cell,
-            mx->rnn->$mode(
-                num_hidden => $num_hidden,
-                prefix => "lstm_${i}r0_"
-            ),
-            output_prefix => "bi_lstm_$i"
-        );
-    }
-    $stack->add($cell);
-}
-
-my $data  = mx->sym->Variable('data');
-my $label = mx->sym->Variable('softmax_label');
-my $embed = mx->sym->Embedding(
-        data => $data, input_dim => scalar(keys %vocabulary),
-        output_dim => $num_embed, name => 'embed'
-);
-$stack->reset;
-my ($outputs, $states) = $stack->unroll($seq_size, inputs => $embed, merge_outputs => 1);
-my $pred  = mx->sym->Reshape($outputs, shape => [-1, $num_hidden*(1+($bidirectional ? 1 : 0))]);
-$pred     = mx->sym->FullyConnected(data => $pred, num_hidden => $data_iter->vocab_size, name => 'pred');
-$label    = mx->sym->Reshape($label, shape => [-1]);
-my $net   = mx->sym->SoftmaxOutput(data => $pred, label => $label, name => 'softmax');
-
-my $contexts;
-if(defined $gpus)
-{
-    $contexts = [map { mx->gpu($_) } split(/,/, $gpus)];
-}
-else
-{
-    $contexts = mx->cpu(0);
-}
-
-my $model = mx->mod->Module(
-    symbol  => $net,
-    context => $contexts
-);
-$model->fit(
-    $data_iter,
-    eval_metric         => mx->metric->Perplexity,
-    kvstore             => $kv_store,
-    optimizer           => $optimizer,
-    optimizer_params    => {
-                                learning_rate => $lr,
-                                momentum      => $mom,
-                                wd            => $wd,
-                                clip_gradient => 5,
-                                rescale_grad  => 1/$batch_size,
-                                lr_scheduler  => AI::MXNet::FactorScheduler->new(step => 1000, factor => 0.99)
-                        },
-    initializer         => mx->init->Xavier(factor_type => "in", magnitude => 2.34),
-    num_epoch           => $num_epoch,
-    batch_end_callback  => mx->callback->Speedometer($batch_size, $disp_batches),
-    ($chkp_epoch ? (epoch_end_callback  => [mx->callback->module_checkpoint($model, $chkp_prefix, $chkp_epoch), \&sample]) : ())
-);
-
-my $chkp = 1;
-sub sample {
-    return if not $sample_size;
-    my $inference_model = mx->mod->Module->load($chkp_prefix, $chkp++);
-    $inference_model->bind(data_shapes=>[['data',[1, $seq_size]]], label_shapes=>[['softmax_label',[1, $seq_size]]]);
-    my $input = mx->nd->array($fdata->slice([0, $seq_size-1]))->reshape([1, $seq_size]);
-    $| = 1;
-    for (0..$sample_size-1)
-    {
-        $inference_model->forward(mx->io->DataBatch(data=>[$input]), is_train => 0);
-        my $prob = $inference_model->get_outputs(0)->[0][0]->at($seq_size-1)->aspdl;
-        my $next_char = Math::Random::Discrete->new($prob->reshape(-1)->unpdl, [0..scalar(keys %vocabulary)-1])->rand;
-        print "$reverse_vocab{$next_char}";
-        $input->at(0)->slice([0, $seq_size-2]) .= $input->at(0)->slice([1, $seq_size-1])->copy;
-        $input->at(0)->at($seq_size-1) .= $next_char;
-    }
-}
diff --git a/perl-package/AI-MXNet/examples/cudnn_lstm_bucketing.pl b/perl-package/AI-MXNet/examples/cudnn_lstm_bucketing.pl
deleted file mode 100755
index 53200f3095c0..000000000000
--- a/perl-package/AI-MXNet/examples/cudnn_lstm_bucketing.pl
+++ /dev/null
@@ -1,301 +0,0 @@
-#!/usr/bin/env perl
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-use AI::MXNet qw(mx);
-use AI::MXNet::Function::Parameters;
-use AI::MXNet::Base;
-use Getopt::Long qw(HelpMessage);
-
-GetOptions(
-    'test'            => \(my $do_test                ),
-    'num-layers=i'    => \(my $num_layers   = 2       ),
-    'num-hidden=i'    => \(my $num_hidden   = 256     ),
-    'num-embed=i'     => \(my $num_embed    = 256     ),
-    'num-seq=i'       => \(my $seq_size     = 32      ),
-    'gpus=s'          => \(my $gpus                   ),
-    'kv-store=s'      => \(my $kv_store     = 'device'),
-    'num-epoch=i'     => \(my $num_epoch    = 25      ),
-    'lr=f'            => \(my $lr           = 0.01    ),
-    'optimizer=s'     => \(my $optimizer    = 'adam'  ),
-    'mom=f'           => \(my $mom          = 0       ),
-    'wd=f'            => \(my $wd           = 0.00001 ),
-    'batch-size=i'    => \(my $batch_size   = 32      ),
-    'disp-batches=i'  => \(my $disp_batches = 50      ),
-    'model-prefix=s'  => \(my $model_prefix = 'lstm_' ),
-    'load-epoch=i'    => \(my $load_epoch   = 0       ),
-    'stack-rnn'       => \(my $stack_rnn              ),
-    'bidirectional=i' => \(my $bidirectional          ),
-    'dropout=f',      => \(my $dropout      = 0       ),
-    'help'           => sub { HelpMessage(0) },
-) or HelpMessage(1);
-
-=head1 NAME
-
-    char_lstm.pl - Example of training char LSTM RNN on tiny shakespeare using high level RNN interface
-
-=head1 SYNOPSIS
-
-    --test           Whether to test or train (default 0)
-    --num-layers     number of stacked RNN layers, default=2
-    --num-hidden     hidden layer size, default=200
-    --num-seq        sequence size, default=32
-    --gpus           list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu.
-                     Increase batch size when using multiple gpus for best performance.
-    --kv-store       key-value store type, default='device'
-    --num-epochs     max num of epochs, default=25
-    --lr             initial learning rate, default=0.01
-    --optimizer      the optimizer type, default='adam'
-    --mom            momentum for sgd, default=0.0
-    --wd             weight decay for sgd, default=0.00001
-    --batch-size     the batch size type, default=32
-    --disp-batches   show progress for every n batches, default=50
-    --model-prefix   prefix for checkpoint files for loading/saving, default='lstm_'
-    --load-epoch     load from epoch
-    --stack-rnn      stack rnn to reduce communication overhead (1,0 default 0)
-    --bidirectional  whether to use bidirectional layers (1,0 default 0)
-    --dropout        dropout probability (1.0 - keep probability), default 0
-=cut
-
-$bidirectional = $bidirectional ? 1 : 0;
-$stack_rnn     = $stack_rnn     ? 1 : 0;
-
-func tokenize_text($fname, :$vocab=, :$invalid_label=-1, :$start_label=0)
-{
-    open(F, $fname) or die "Can't open $fname: $!";
-    my @lines = map { my $l = [split(/ /)]; shift(@$l); $l } (<F>);
-    my $sentences;
-    ($sentences, $vocab) = mx->rnn->encode_sentences(
-        \@lines,
-        vocab         => $vocab,
-        invalid_label => $invalid_label,
-        start_label   => $start_label
-    );
-    return ($sentences, $vocab);
-}
-
-my $buckets = [10, 20, 30, 40, 50, 60];
-my $start_label   = 1;
-my $invalid_label = 0;
-
-func get_data($layout)
-{
-    my ($train_sentences, $vocabulary) = tokenize_text(
-        './data/sherlockholmes.train.txt', start_label => $start_label,
-        invalid_label => $invalid_label
-    );
-    my ($validation_sentences) = tokenize_text(
-        './data/sherlockholmes.test.txt', vocab => $vocabulary,
-        start_label => $start_label, invalid_label => $invalid_label
-    );
-    my $data_train  = mx->rnn->BucketSentenceIter(
-        $train_sentences, $batch_size, buckets => $buckets,
-        invalid_label => $invalid_label,
-        layout        => $layout
-    );
-    my $data_val    = mx->rnn->BucketSentenceIter(
-        $validation_sentences, $batch_size, buckets => $buckets,
-        invalid_label => $invalid_label,
-        layout        => $layout
-    );
-    return ($data_train, $data_val, $vocabulary);
-}
-
-my $train = sub
-{
-    my ($data_train, $data_val, $vocab) = get_data('TN');
-    my $cell;
-    if($stack_rnn)
-    {
-        my $stack = mx->rnn->SequentialRNNCell();
-        for my $i (0..$num_layers-1)
-        {
-            my $dropout_rate = 0;
-            if($i < $num_layers-1)
-            {
-                $dropout_rate = $dropout;
-            }
-            $stack->add(
-                mx->rnn->FusedRNNCell(
-                    $num_hidden, num_layers => 1,
-                    mode => 'lstm', prefix => "lstm_$i",
-                    bidirectional => $bidirectional, dropout => $dropout_rate
-                )
-            );
-        }
-        $cell = $stack;
-    }
-    else
-    {
-        $cell = mx->rnn->FusedRNNCell(
-            $num_hidden, mode => 'lstm', num_layers => $num_layers,
-            bidirectional => $bidirectional, dropout => $dropout
-        );
-    }
-
-    my $sym_gen = sub { my $seq_len = shift;
-        my $data = mx->sym->Variable('data');
-        my $label = mx->sym->Variable('softmax_label');
-        my $embed = mx->sym->Embedding(data=>$data, input_dim=>scalar(keys %$vocab), output_dim=>$num_embed,name=>'embed');
-        my ($output) = $cell->unroll($seq_len, inputs=>$embed, merge_outputs=>1, layout=>'TNC');
-        my $pred = mx->sym->Reshape($output, shape=>[-1, $num_hidden*(1+$bidirectional)]);
-        $pred = mx->sym->FullyConnected(data=>$pred, num_hidden=>scalar(keys %$vocab), name=>'pred');
-        $label = mx->sym->Reshape($label, shape=>[-1]);
-        $pred = mx->sym->SoftmaxOutput(data=>$pred, label=>$label, name=>'softmax');
-        return ($pred, ['data'], ['softmax_label']);
-    };
-
-    my $contexts;
-    if(defined $gpus)
-    {
-        $contexts = [map { mx->gpu($_) } split(/,/, $gpus)];
-    }
-    else
-    {
-        $contexts = mx->cpu(0);
-    }
-
-    my $model = mx->mod->BucketingModule(
-        sym_gen             => $sym_gen,
-        default_bucket_key  => $data_train->default_bucket_key,
-        context             => $contexts
-    );
-
-    my ($arg_params, $aux_params);
-    if($load_epoch)
-    {
-        (undef, $arg_params, $aux_params) = mx->rnn->load_rnn_checkpoint(
-            $cell, $model_prefix, $load_epoch);
-    }
-    $model->fit(
-        $data_train,
-        eval_data           => $data_val,
-        eval_metric         => mx->metric->Perplexity($invalid_label),
-        kvstore             => $kv_store,
-        optimizer           => $optimizer,
-        optimizer_params    => {
-                                learning_rate => $lr,
-                                momentum      => $mom,
-                                wd            => $wd,
-                            },
-        begin_epoch         => $load_epoch,
-        initializer         => mx->init->Xavier(factor_type => "in", magnitude => 2.34),
-        num_epoch           => $num_epoch,
-        batch_end_callback  => mx->callback->Speedometer($batch_size, $disp_batches),
-        ($model_prefix ? (epoch_end_callback  => mx->rnn->do_rnn_checkpoint($cell, $model_prefix, 1)) : ())
-    );
-};
-
-my $test = sub {
-    assert($model_prefix, "Must specifiy path to load from");
-    my (undef, $data_val, $vocab) = get_data('NT');
-    my $stack;
-    if($stack_rnn)
-    {
-        $stack = mx->rnn->SequentialRNNCell();
-        for my $i (0..$num_layers-1)
-        {
-            my $cell = mx->rnn->LSTMCell(num_hidden => $num_hidden, prefix => "lstm_${i}l0_");
-            if($bidirectional)
-            {
-                $cell = mx->rnn->BidirectionalCell(
-                    $cell,
-                    mx->rnn->LSTMCell(
-                        num_hidden => $num_hidden,
-                        prefix => "lstm_${i}r0_"
-                    ),
-                    output_prefix => "bi_lstm_$i"
-                );
-            }
-            $stack->add($cell);
-        }
-    }
-    else
-    {
-        $stack = mx->rnn->FusedRNNCell(
-            $num_hidden,  num_layers    => $num_layers,
-            mode=>'lstm', bidirectional => $bidirectional
-        )->unfuse()
-    }
-    my $sym_gen = sub {
-        my $seq_len = shift;
-        my $data  = mx->sym->Variable('data');
-        my $label = mx->sym->Variable('softmax_label');
-        my $embed = mx->sym->Embedding(
-            data => $data, input_dim => scalar(keys %$vocab),
-            output_dim => $num_embed, name => 'embed'
-        );
-        $stack->reset;
-        my ($outputs, $states) = $stack->unroll($seq_len, inputs => $embed, merge_outputs => 1);
-        my $pred = mx->sym->Reshape($outputs, shape => [-1, $num_hidden*(1+$bidirectional)]);
-        $pred    = mx->sym->FullyConnected(data => $pred, num_hidden => scalar(keys %$vocab), name => 'pred');
-        $label   = mx->sym->Reshape($label, shape => [-1]);
-        $pred    = mx->sym->SoftmaxOutput(data => $pred, label => $label, name => 'softmax');
-        return ($pred, ['data'], ['softmax_label']);
-    };
-    my $contexts;
-    if($gpus)
-    {
-        $contexts = [map { mx->gpu($_) } split(/,/, $gpus)];
-    }
-    else
-    {
-        $contexts = mx->cpu(0);
-    }
-
-    my ($arg_params, $aux_params);
-    if($load_epoch)
-    {
-        (undef, $arg_params, $aux_params) = mx->rnn->load_rnn_checkpoint(
-            $stack, $model_prefix, $load_epoch);
-    }
-    my $model = mx->mod->BucketingModule(
-        sym_gen             => $sym_gen,
-        default_bucket_key  => $data_val->default_bucket_key,
-        context             => $contexts
-    );
-    $model->bind(
-        data_shapes  => $data_val->provide_data,
-        label_shapes => $data_val->provide_label,
-        for_training => 0,
-        force_rebind => 0
-    );
-    $model->set_params($arg_params, $aux_params);
-    my $score = $model->score($data_val,
-        mx->metric->Perplexity($invalid_label),
-        batch_end_callback=>mx->callback->Speedometer($batch_size, 5)
-    );
-};
-
-if($num_layers >= 4 and split(/,/,$gpus) >= 4 and not $stack_rnn)
-{
-    print("WARNING: stack-rnn is recommended to train complex model on multiple GPUs\n");
-}
-
-if($do_test)
-{
-    # Demonstrates how to load a model trained with CuDNN RNN and predict
-    # with non-fused MXNet symbol
-    $test->();
-}
-else
-{
-    $train->();
-}
diff --git a/perl-package/AI-MXNet/examples/lstm_bucketing.pl b/perl-package/AI-MXNet/examples/lstm_bucketing.pl
deleted file mode 100755
index 168c7c2be30f..000000000000
--- a/perl-package/AI-MXNet/examples/lstm_bucketing.pl
+++ /dev/null
@@ -1,157 +0,0 @@
-#!/usr/bin/env perl
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-use PDL;
-use AI::MXNet qw(mx);
-use AI::MXNet::Function::Parameters;
-use Getopt::Long qw(HelpMessage);
-
-GetOptions(
-    'num-layers=i'   => \(my $num_layers   = 2       ),
-    'num-hidden=i'   => \(my $num_hidden   = 200     ),
-    'num-embed=i'    => \(my $num_embed    = 200     ),
-    'gpus=s'         => \(my $gpus                   ),
-    'kv-store=s'     => \(my $kv_store     = 'device'),
-    'num-epoch=i'    => \(my $num_epoch    = 25      ),
-    'lr=f'           => \(my $lr           = 0.01    ),
-    'optimizer=s'    => \(my $optimizer    = 'sgd'   ),
-    'mom=f'          => \(my $mom          = 0       ),
-    'wd=f'           => \(my $wd           = 0.00001 ),
-    'batch-size=i'   => \(my $batch_size   = 32      ),
-    'disp-batches=i' => \(my $disp_batches = 50      ),
-    'chkp-prefix=s'  => \(my $chkp_prefix  = 'lstm_' ),
-    'chkp-epoch=i'   => \(my $chkp_epoch   = 0       ),
-    'help'           => sub { HelpMessage(0) },
-) or HelpMessage(1);
-
-=head1 NAME
-
-    lstm_bucketing.pl - Example of training LSTM RNN on Sherlock Holmes data using high level RNN interface
-
-=head1 SYNOPSIS
-
-    --num-layers     number of stacked RNN layers, default=2
-    --num-hidden     hidden layer size, default=200
-    --num-embed      embedding layer size, default=200
-    --gpus           list of gpus to run, e.g. 0 or 0,2,5. empty means using cpu.
-                     Increase batch size when using multiple gpus for best performance.
-    --kv-store       key-value store type, default='device'
-    --num-epochs     max num of epochs, default=25
-    --lr             initial learning rate, default=0.01
-    --optimizer      the optimizer type, default='sgd'
-    --mom            momentum for sgd, default=0.0
-    --wd             weight decay for sgd, default=0.00001
-    --batch-size     the batch size type, default=32
-    --disp-batches   show progress for every n batches, default=50
-    --chkp-prefix    prefix for checkpoint files, default='lstm_'
-    --chkp-epoch     save checkpoint after this many epoch, default=0 (saving checkpoints is disabled)
-
-=cut
-func tokenize_text($fname, :$vocab=, :$invalid_label=-1, :$start_label=0)
-{
-    open(F, $fname) or die "Can't open $fname: $!";
-    my @lines = map { my $l = [split(/ /)]; shift(@$l); $l } (<F>);
-    my $sentences;
-    ($sentences, $vocab) = mx->rnn->encode_sentences(
-        \@lines,
-        vocab         => $vocab,
-        invalid_label => $invalid_label,
-        start_label   => $start_label
-    );
-    return ($sentences, $vocab);
-}
-
-my $buckets = [10, 20, 30, 40, 50, 60];
-my $start_label   = 1;
-my $invalid_label = 0;
-
-my ($train_sentences, $vocabulary) = tokenize_text(
-    './data/sherlockholmes.train.txt', start_label => $start_label,
-    invalid_label => $invalid_label
-);
-my ($validation_sentences) = tokenize_text(
-    './data/sherlockholmes.test.txt', vocab => $vocabulary,
-    start_label => $start_label, invalid_label => $invalid_label
-);
-my $data_train  = mx->rnn->BucketSentenceIter(
-    $train_sentences, $batch_size, buckets => $buckets,
-    invalid_label => $invalid_label
-);
-my $data_val    = mx->rnn->BucketSentenceIter(
-    $validation_sentences, $batch_size, buckets => $buckets,
-    invalid_label => $invalid_label
-);
-
-my $stack = mx->rnn->SequentialRNNCell();
-for my $i (0..$num_layers-1)
-{
-    $stack->add(mx->rnn->LSTMCell(num_hidden => $num_hidden, prefix => "lstm_l${i}_"));
-}
-
-my $sym_gen = sub {
-    my $seq_len = shift;
-    my $data  = mx->sym->Variable('data');
-    my $label = mx->sym->Variable('softmax_label');
-    my $embed = mx->sym->Embedding(
-        data => $data, input_dim => scalar(keys %$vocabulary),
-        output_dim => $num_embed, name => 'embed'
-    );
-    $stack->reset;
-    my ($outputs, $states) = $stack->unroll($seq_len, inputs => $embed, merge_outputs => 1);
-    my $pred = mx->sym->Reshape($outputs, shape => [-1, $num_hidden]);
-    $pred    = mx->sym->FullyConnected(data => $pred, num_hidden => scalar(keys %$vocabulary), name => 'pred');
-    $label   = mx->sym->Reshape($label, shape => [-1]);
-    $pred    = mx->sym->SoftmaxOutput(data => $pred, label => $label, name => 'softmax');
-    return ($pred, ['data'], ['softmax_label']);
-};
-
-my $contexts;
-if(defined $gpus)
-{
-    $contexts = [map { mx->gpu($_) } split(/,/, $gpus)];
-}
-else
-{
-    $contexts = mx->cpu(0);
-}
-
-my $model = mx->mod->BucketingModule(
-    sym_gen             => $sym_gen,
-    default_bucket_key  => $data_train->default_bucket_key,
-    context             => $contexts
-);
-
-$model->fit(
-    $data_train,
-    eval_data           => $data_val,
-    eval_metric         => mx->metric->Perplexity($invalid_label),
-    kvstore             => $kv_store,
-    optimizer           => $optimizer,
-    optimizer_params    => {
-                                learning_rate => $lr,
-                                momentum      => $mom,
-                                wd            => $wd,
-                        },
-    initializer         => mx->init->Xavier(factor_type => "in", magnitude => 2.34),
-    num_epoch           => $num_epoch,
-    batch_end_callback  => mx->callback->Speedometer($batch_size, $disp_batches),
-    ($chkp_epoch ? (epoch_end_callback  => mx->rnn->do_rnn_checkpoint($stack, $chkp_prefix, $chkp_epoch)) : ())
-);
diff --git a/perl-package/AI-MXNet/examples/mnist.pl b/perl-package/AI-MXNet/examples/mnist.pl
deleted file mode 100755
index f8445ebbd4ae..000000000000
--- a/perl-package/AI-MXNet/examples/mnist.pl
+++ /dev/null
@@ -1,202 +0,0 @@
-#!/usr/bin/env perl
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-# derived from https://mxnet.io/tutorials/python/mnist.html
-use LWP::UserAgent ();
-use PDL ();
-#use Gtk2 '-init';
-use AI::MXNet ('mx');
-
-my $ua = LWP::UserAgent->new();
-
-sub download_data {
-    my($url, $force_download) = @_;
-    $force_download = 1 if @_ < 2;
-    my $fname = (split m{/}, $url)[-1];
-    if($force_download or not -f $fname) {
-        $ua->get($url, ':content_file' => $fname);
-    }
-    return $fname;
-}
-
-sub read_data {
-    my($label_url, $image_url) = @_;
-    my($magic, $num, $rows, $cols);
-
-    open my($flbl), '<:gzip', download_data($label_url);
-    read $flbl, my($buf), 8;
-    ($magic, $num) = unpack 'N2', $buf;
-    my $label = PDL->new();
-    $label->set_datatype($PDL::Types::PDL_B);
-    $label->setdims([ $num ]);
-    read $flbl, ${$label->get_dataref}, $num;
-    $label->upd_data();
-
-    open my($fimg), '<:gzip', download_data($image_url);
-    read $fimg, $buf, 16;
-    ($magic, $num, $rows, $cols) = unpack 'N4', $buf;
-    my $image = PDL->new();
-    $image->set_datatype($PDL::Types::PDL_B);
-    $image->setdims([ $rows, $cols, $num ]);
-    read $fimg, ${$image->get_dataref}, $num * $rows * $cols;
-    $image->upd_data();
-
-    return($label, $image);
-}
-
-my $path='http://yann.lecun.com/exdb/mnist/';
-my($train_lbl, $train_img) = read_data(
-    "${path}train-labels-idx1-ubyte.gz", "${path}train-images-idx3-ubyte.gz");
-my($val_lbl, $val_img) = read_data(
-    "${path}t10k-labels-idx1-ubyte.gz", "${path}t10k-images-idx3-ubyte.gz");
-
-sub show_sample {
-    print 'label: ', $train_lbl->slice('0:9'), "\n";
-    my $hbox = Gtk2::HBox->new(0, 2);
-    for my $i (0 .. 9) {
-        my $img = $train_img->slice(":,:,$i");
-        my($w, $h) = $img->dims;
-        $img->make_physical();
-        # ugh, pixbufs don't have a grayscale colorspace?!
-        # burst it to rgb I guess.
-        my $data = pack 'c*', map { $_, $_, $_ } unpack 'c*', ${$img->get_dataref};
-        $hbox->add(Gtk2::Image->new_from_pixbuf(
-            Gtk2::Gdk::Pixbuf->new_from_data($data, 'rgb', 0, 8, $w, $h, $w * 3)
-        ));
-    }
-    my $win = Gtk2::Window->new('toplevel');
-    $win->signal_connect(delete_event => sub { Gtk2->main_quit() });
-    $win->add($hbox);
-    $win->show_all();
-    Gtk2->main();
-}
-
-sub show_network {
-    my($viz) = @_;
-    my $load = Gtk2::Gdk::PixbufLoader->new();
-    $load->write($viz->graph->as_png);
-    $load->close();
-    my $img = Gtk2::Image->new_from_pixbuf($load->get_pixbuf());
-    my $sw = Gtk2::ScrolledWindow->new(undef, undef);
-    $sw->add_with_viewport($img);
-    my $win = Gtk2::Window->new('toplevel');
-    $win->signal_connect(delete_event => sub { Gtk2->main_quit() });
-    $win->add($sw);
-    $win->show_all();
-    Gtk2->main();
-}
-
-#show_sample();
-
-sub to4d {
-    my($img) = @_;
-    return $img->reshape(28, 28, 1, ($img->dims)[2])->float / 255;
-}
-
-my $batch_size = 100;
-my $train_iter = mx->io->NDArrayIter(
-    data => to4d($train_img),
-    label => $train_lbl,
-    batch_size => $batch_size,
-    shuffle => 1,
-);
-my $val_iter = mx->io->NDArrayIter(
-    data => to4d($val_img),
-    label => $val_lbl,
-    batch_size => $batch_size,
-);
-
-# Create a place holder variable for the input data
-my $data = mx->sym->Variable('data');
-
-sub nn_fc {
-    # Epoch[9] Train-accuracy=0.978889
-    # Epoch[9] Time cost=145.437
-    # Epoch[9] Validation-accuracy=0.964600
-    my($data) = @_;
-
-    # Flatten the data from 4-D shape (batch_size, num_channel, width, height)
-    # into 2-D (batch_size, num_channel*width*height)
-    $data = mx->sym->Flatten(data => $data);
-
-    # The first fully-connected layer
-#    my $fc1  = mx->sym->FullyConnected(data => $data, name => 'fc1', num_hidden => 128);
-#    # Apply relu to the output of the first fully-connnected layer
-#    my $act1 = mx->sym->Activation(data => $fc1, name => 'relu1', act_type => "relu");
-
-    # The second fully-connected layer and the according activation function
-    my $fc2  = mx->sym->FullyConnected(data => $data, name => 'fc2', num_hidden => 64);
-    my $act2 = mx->sym->Activation(data => $fc2, name => 'relu2', act_type => "relu");
-
-    # The thrid fully-connected layer, note that the hidden size should be 10, which is the number of unique digits
-    my $fc3  = mx->sym->FullyConnected(data => $act2, name => 'fc3', num_hidden => 10);
-    # The softmax and loss layer
-    my $mlp  = mx->sym->SoftmaxOutput(data => $fc3, name => 'softmax');
-    return $mlp;
-}
-
-sub nn_conv {
-    my($data) = @_;
-    # Epoch[9] Batch [200]	Speed: 1625.07 samples/sec	Train-accuracy=0.992090
-    # Epoch[9] Batch [400]	Speed: 1630.12 samples/sec	Train-accuracy=0.992850
-    # Epoch[9] Train-accuracy=0.991357
-    # Epoch[9] Time cost=36.817
-    # Epoch[9] Validation-accuracy=0.988100
-
-    my $conv1= mx->symbol->Convolution(data => $data, name => 'conv1', num_filter => 20, kernel => [5,5], stride => [2,2]);
-    my $bn1  = mx->symbol->BatchNorm(data => $conv1, name => "bn1");
-    my $act1 = mx->symbol->Activation(data => $bn1, name => 'relu1', act_type => "relu");
-    my $mp1  = mx->symbol->Pooling(data => $act1, name => 'mp1', kernel => [2,2], stride =>[1,1], pool_type=>'max');
-
-    my $conv2= mx->symbol->Convolution(data => $mp1, name => 'conv2', num_filter => 50, kernel=>[3,3], stride=>[2,2]);
-    my $bn2  = mx->symbol->BatchNorm(data => $conv2, name=>"bn2");
-    my $act2 = mx->symbol->Activation(data => $bn2, name=>'relu2', act_type=>"relu");
-    my $mp2  = mx->symbol->Pooling(data => $act2, name => 'mp2', kernel=>[2,2], stride=>[1,1], pool_type=>'max');
-
-
-    my $fl   = mx->symbol->Flatten(data => $mp2, name=>"flatten");
-    my $fc1  = mx->symbol->FullyConnected(data => $fl,  name=>"fc1", num_hidden=>100);
-    my $act3 = mx->symbol->Activation(data => $fc1, name=>'relu3', act_type=>"relu");
-    my $fc2  = mx->symbol->FullyConnected(data => $act3, name=>'fc2', num_hidden=>30);
-    my $act4 = mx->symbol->Activation(data => $fc2, name=>'relu4', act_type=>"relu");
-    my $fc3  = mx->symbol->FullyConnected(data => $act4, name=>'fc3', num_hidden=>10);
-    my $softmax = mx->symbol->SoftmaxOutput(data => $fc3, name => 'softmax');
-    return $softmax;
-}
-
-my $mlp = $ARGV[0] ? nn_conv($data) : nn_fc($data);
-
-#We visualize the network structure with output size (the batch_size is ignored.)
-#my $shape = { data => [ $batch_size, 1, 28, 28 ] };
-#show_network(mx->viz->plot_network($mlp, shape => $shape));
-
-my $model = mx->mod->Module(
-    symbol => $mlp,       # network structure
-);
-$model->fit(
-    $train_iter,       # training data
-    num_epoch => 10,      # number of data passes for training
-    eval_data => $val_iter, # validation data
-    batch_end_callback => mx->callback->Speedometer($batch_size, 200), # output progress for each 200 data batches
-    optimizer => 'adam',
-);
-
-
diff --git a/perl-package/AI-MXNet/examples/plot_network.pl b/perl-package/AI-MXNet/examples/plot_network.pl
deleted file mode 100755
index bf39988e7105..000000000000
--- a/perl-package/AI-MXNet/examples/plot_network.pl
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env perl
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-use AI::MXNet qw(mx);
-
-### model
-my $data = mx->symbol->Variable('data');
-my $conv1= mx->symbol->Convolution(data => $data, name => 'conv1', num_filter => 32, kernel => [3,3], stride => [2,2]);
-my $bn1  = mx->symbol->BatchNorm(data => $conv1, name => "bn1");
-my $act1 = mx->symbol->Activation(data => $bn1, name => 'relu1', act_type => "relu");
-my $mp1  = mx->symbol->Pooling(data => $act1, name => 'mp1', kernel => [2,2], stride =>[2,2], pool_type=>'max');
-
-my $conv2= mx->symbol->Convolution(data => $mp1, name => 'conv2', num_filter => 32, kernel=>[3,3], stride=>[2,2]);
-my $bn2  = mx->symbol->BatchNorm(data => $conv2, name=>"bn2");
-my $act2 = mx->symbol->Activation(data => $bn2, name=>'relu2', act_type=>"relu");
-my $mp2  = mx->symbol->Pooling(data => $act2, name => 'mp2', kernel=>[2,2], stride=>[2,2], pool_type=>'max');
-
-
-my $fl   = mx->symbol->Flatten(data => $mp2, name=>"flatten");
-my $fc1  = mx->symbol->FullyConnected(data => $fl,  name=>"fc1", num_hidden=>30);
-my $act3 = mx->symbol->Activation(data => $fc1, name=>'relu3', act_type=>"relu");
-my $fc2  = mx->symbol->FullyConnected(data => $act3, name=>'fc2', num_hidden=>10);
-my $softmax = mx->symbol->SoftmaxOutput(data => $fc2, name => 'softmax');
-
-## creates the image file in working directory, you need GraphViz installed for this to work
-mx->viz->plot_network($softmax, save_format => 'png')->render("network.png");
diff --git a/perl-package/AI-MXNet/examples/sparse/matrix_factorization/README.md b/perl-package/AI-MXNet/examples/sparse/matrix_factorization/README.md
deleted file mode 100644
index 3eb1bab508e5..000000000000
--- a/perl-package/AI-MXNet/examples/sparse/matrix_factorization/README.md
+++ /dev/null
@@ -1,26 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-Matrix Factorization w/ Sparse Embedding
-===========
-The example demonstrates the basic usage of the SparseEmbedding operator in MXNet, adapted based on @leopd's recommender examples.
-The operator is available on both CPU and GPU. This is for demonstration purpose only.
-
-- get_data.sh
-- perl train.pl
-- To compare the training speed with (dense) Embedding, run perl train.pl --use-dense 1
-- To run the example on the GPU, run perl train.pl --use-gpu 1
diff --git a/perl-package/AI-MXNet/examples/sparse/matrix_factorization/get_data.sh b/perl-package/AI-MXNet/examples/sparse/matrix_factorization/get_data.sh
deleted file mode 100755
index b8b14e136d17..000000000000
--- a/perl-package/AI-MXNet/examples/sparse/matrix_factorization/get_data.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/usr/bin/env bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-DIR=$(cd `dirname $0`; pwd)
-DATA_DIR="${DIR}/data/"
-
-if [[ ! -d "${DATA_DIR}" ]]; then
-  echo "${DATA_DIR} doesn't exist, will create one";
-  mkdir -p ${DATA_DIR}
-fi
-
-wget -P ${DATA_DIR} http://files.grouplens.org/datasets/movielens/ml-10m.zip
-cd ${DATA_DIR}
-unzip ml-10m.zip
-cd ml-10M100K
-chmod +x allbut.pl
-sh split_ratings.sh
\ No newline at end of file
diff --git a/perl-package/AI-MXNet/examples/sparse/matrix_factorization/train.pl b/perl-package/AI-MXNet/examples/sparse/matrix_factorization/train.pl
deleted file mode 100755
index fa6a76376b8e..000000000000
--- a/perl-package/AI-MXNet/examples/sparse/matrix_factorization/train.pl
+++ /dev/null
@@ -1,184 +0,0 @@
-#!/usr/bin/env perl
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-use AI::MXNet qw(mx);
-use Getopt::Long qw(HelpMessage);
-
-GetOptions(
-    'print-every=i'  => \(my $print_every  = 100),
-    'factor-size=i'  => \(my $factor_size  = 128),
-    'use-gpu=i'      => \(my $use_gpu      = 0  ),
-    'num-epoch=i'    => \(my $num_epoch    = 3  ),
-    'batch-size=i'   => \(my $batch_size   = 128),
-    'use-dense=i'    => \(my $use_dense    = 0  ),
-    'help'           => sub { HelpMessage(0) },
-) or HelpMessage(1);
-
-=head1 NAME
-
-    train.pl - Run matrix factorization with sparse embedding
-
-=head1 SYNOPSIS
-
-    --print-every  logging frequency, 100
-    --factor-size  the factor size of the embedding operation, 128
-    --use-gpu      use gpu, 0
-    --num-epoch    number of epochs to train, 3
-    --batch-size   number of examples per batch, 128
-    --use-dense    use the dense embedding operator, 0
-
-=cut
-
-my %MOVIELENS = (
-    dataset   => 'ml-10m',
-    train     => './data/ml-10M100K/r1.train',
-    val       =>  './data/ml-10M100K/r1.test',
-    max_user  => 71569,
-    max_movie => 65135,
-);
-
-sub get_movielens_iter
-{
-    my ($filename, $batch_size) = @_;
-    print "Preparing data iterators for $filename ... \n";
-    my @user;
-    my @item;
-    my @score;
-    open(F, $filename) or die $!;
-    my $num_samples = 0;
-    while(my $line = <F>)
-    {
-        my @tks = split('::', $line);
-        next unless @tks == 4;
-        $num_samples++;
-        push @user,  [$tks[0]];
-        push @item,  [$tks[1]];
-        push @score, [$tks[2]];
-    }
-    # convert to ndarrays
-    my $user = mx->nd->array(\@user, dtype=>'int32');
-    my $item = mx->nd->array(\@item);
-    my $score = mx->nd->array(\@score);
-    return mx->io->NDArrayIter(
-        data  => Hash::Ordered->new(user => $user, item => $item),
-        label => Hash::Ordered->new(score => $score),
-        batch_size => $batch_size,
-        shuffle => 1
-    );
-}
-
-sub matrix_fact_net
-{
-    my ($factor_size, $num_hidden, $max_user, $max_item, $sparse_embed) = @_;
-    $sparse_embed //= 1;
-    # input
-    my $user = mx->symbol->Variable('user');
-    my $item = mx->symbol->Variable('item');
-    my $score = mx->symbol->Variable('score');
-    if($sparse_embed)
-    {
-        # user feature lookup
-        my $user_weight = mx->symbol->Variable('user_weight', stype=>'row_sparse');
-        $user = mx->symbol->contrib->SparseEmbedding(data=>$user, weight=>$user_weight,
-                                                 input_dim=>$max_user, output_dim=>$factor_size);
-        # item feature lookup
-        my $item_weight = mx->symbol->Variable('item_weight', stype=>'row_sparse');
-        $item = mx->symbol->contrib->SparseEmbedding(data=>$item, weight=>$item_weight,
-                                                 input_dim=>$max_item, output_dim=>$factor_size);
-    }
-    else
-    {
-        # user feature lookup
-        $user = mx->symbol->Embedding(data=>$user, input_dim=>$max_user, output_dim=>$factor_size);
-        # item feature lookup
-        $item = mx->symbol->Embedding(data=>$item, input_dim=>$max_item, output_dim=>$factor_size);
-    }
-    # non-linear transformation of user features
-    $user = mx->symbol->Activation(data=>$user, act_type=>'relu');
-    $user = mx->symbol->FullyConnected(data=>$user, num_hidden=>$num_hidden);
-    # non-linear transformation of item features
-    $item = mx->symbol->Activation(data=>$item, act_type=>'relu');
-    $item = mx->symbol->FullyConnected(data=>$item, num_hidden=>$num_hidden);
-    # predict by the inner product, which is elementwise product and then sum
-    my $pred = $user * $item;
-    $pred = mx->symbol->sum(data=>$pred, axis => 1);
-    $pred = mx->symbol->Flatten(data=>$pred);
-    # loss layer
-    $pred = mx->symbol->LinearRegressionOutput(data=>$pred, label=>$score);
-    return $pred;
-}
-
-my $optimizer = 'sgd';
-my $use_sparse = not $use_dense;
-
-my $momentum = 0.9;
-my $ctx = $use_gpu ? mx->gpu(0) : mx->cpu(0);
-my $learning_rate = 0.1;
-
-# prepare dataset and iterators
-my $max_user   = $MOVIELENS{max_user};
-my $max_movies = $MOVIELENS{max_movie};
-my $train_iter = get_movielens_iter($MOVIELENS{train}, $batch_size);
-my $val_iter   = get_movielens_iter($MOVIELENS{val}  , $batch_size);
-
-# construct the model
-my $net = matrix_fact_net($factor_size, $factor_size, $max_user, $max_movies, $use_sparse);
-
-# initialize the module
-my $mod = mx->module->Module(symbol=>$net, context=>$ctx, data_names=>['user', 'item'],
-                           label_names=>['score']);
-$mod->bind(data_shapes=>$train_iter->provide_data, label_shapes=>$train_iter->provide_label);
-$mod->init_params(initializer=>mx->init->Xavier(factor_type=>"in", magnitude=>2.34));
-my $optim = mx->optimizer->create($optimizer, learning_rate=>$learning_rate, momentum=>$momentum,
-                                wd=>1e-4, rescale_grad=>1.0/$batch_size);
-$mod->init_optimizer(optimizer=>$optim);
-
-# use MSE as the metric
-my $metric = mx->metric->create(['MSE']);
-my $speedometer = mx->callback->Speedometer($batch_size, $print_every);
-print "Training started ...\n";
-for my $epoch (0..$num_epoch-1)
-{
-    my $nbatch = 0;
-    $metric->reset();
-    while(my $batch = <$train_iter>)
-    {
-        $nbatch += 1;
-        $mod->forward_backward($batch);
-        # update all parameters
-        $mod->update();
-        # update training metric
-        $mod->update_metric($metric, $batch->label);
-        my $speedometer_param = AI::MXNet::BatchEndParam->new(
-            epoch=>$epoch, nbatch=>$nbatch,
-            eval_metric=>$metric
-        );
-        $speedometer->($speedometer_param);
-    }
-    # evaluate metric on validation dataset
-    my $score = $mod->score($val_iter, ['MSE']);
-    printf("epoch %d, eval MSE = %s \n", $epoch, $score->{mse});
-    # reset the iterator for next pass of data
-    $train_iter->reset();
-    $val_iter->reset();
-}
-print "Training completed.\n";
-
diff --git a/perl-package/AI-MXNet/examples/sparse/wide_deep/README.md b/perl-package/AI-MXNet/examples/sparse/wide_deep/README.md
deleted file mode 100644
index 4a01da4254e6..000000000000
--- a/perl-package/AI-MXNet/examples/sparse/wide_deep/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-## Wide and Deep Learning
-
-The example demonstrates how to train [wide and deep model](https://arxiv.org/abs/1606.07792). The [Census Income Data Set](https://archive.ics.uci.edu/ml/datasets/Census+Income) that this example uses for training is hosted by the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/). Tricks of feature engineering are adapted from tensorflow's [wide and deep tutorial](https://github.com/tensorflow/models/tree/master/official/wide_deep).
-
-The final accuracy should be around 84%.
-
-- perl train.pl
diff --git a/perl-package/AI-MXNet/examples/sparse/wide_deep/get_data.sh b/perl-package/AI-MXNet/examples/sparse/wide_deep/get_data.sh
deleted file mode 100755
index 69e5b4dc177f..000000000000
--- a/perl-package/AI-MXNet/examples/sparse/wide_deep/get_data.sh
+++ /dev/null
@@ -1,32 +0,0 @@
-#!/usr/bin/env bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-DIR=$(cd `dirname $0`; pwd)
-DATA_DIR="${DIR}/data/"
-
-if [[ ! -d "${DATA_DIR}" ]]; then
-  echo "${DATA_DIR} doesn't exist, will create one";
-  mkdir -p ${DATA_DIR}
-fi
-
-wget -P ${DATA_DIR} https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data
-wget -P ${DATA_DIR} https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test
-cd ${DATA_DIR}
-sed -i '1d' adult.test
-sed -i 's/.$//' adult.test
\ No newline at end of file
diff --git a/perl-package/AI-MXNet/examples/sparse/wide_deep/train.pl b/perl-package/AI-MXNet/examples/sparse/wide_deep/train.pl
deleted file mode 100755
index cd18c61c6f44..000000000000
--- a/perl-package/AI-MXNet/examples/sparse/wide_deep/train.pl
+++ /dev/null
@@ -1,301 +0,0 @@
-#!/usr/bin/env perl
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-use AI::MXNet qw(mx);
-use AI::MXNet::Base qw(pdl enumerate digitize hash array_index range);
-use Getopt::Long qw(HelpMessage);
-
-GetOptions(
-    'log-interval=i' => \(my $log_interval = 100),
-    'optimizer=s'    => \(my $optimizer    = 'adam'),
-    'cuda'           => \(my $cuda         = 0  ),
-    'num-epoch=i'    => \(my $num_epoch    = 10  ),
-    'batch-size=i'   => \(my $batch_size   = 100),
-    'lr=f'           => \(my $lr    = 0.001     ),
-    'help'           => sub { HelpMessage(0) },
-) or HelpMessage(1);
-
-=head1 NAME
-
-    train.pl - Run sparse wide and deep classification
-
-=head1 SYNOPSIS
-
-    --log-interval number of batches to wait before logging training status, 100
-    --optimizer    what optimizer to use, 'adam'
-    --cuda         train on gpu with cuda, 0
-    --num-epoch    number of epochs to train, 10
-    --batch-size   number of examples per batch, 100
-    --lr           learning rate, 0.001
-
-=cut
-
-my %allowed_optimizers = qw(adam 1 sgd 1 ftrl 1);
-Carp::confess("optimizer can only be one of 'adam', 'sgd', 'ftrl'") 
-    unless exists $allowed_optimizers{ $optimizer };
-
-sub wide_deep_model
-{
-    my ($num_linear_features, $num_embed_features, $num_cont_features,
-                    $input_dims, $hidden_units) = @_;
-    # wide model
-    my $csr_data = mx->symbol->Variable("csr_data", stype=>'csr');
-    my $label = mx->symbol->Variable("softmax_label");
-
-    my $norm_init = mx->initializer->Normal(sigma=>0.01);
-    # weight with row_sparse storage type to enable sparse gradient updates
-    my $weight = mx->symbol->Variable("linear_weight", shape=>[$num_linear_features, 2],
-                                init=>$norm_init, stype=>'row_sparse');
-    my $bias = mx->symbol->Variable("linear_bias", shape=>[2]);
-    my $dot = mx->symbol->sparse->dot($csr_data, $weight);
-    my $linear_out = mx->symbol->broadcast_add($dot, $bias);
-    # deep model
-    my $dns_data = mx->symbol->Variable("dns_data");
-    # embedding features
-    my $x = mx->symbol->slice(data=>$dns_data, begin=>[0, 0],
-                        end=>[undef, $num_embed_features]);
-    my $embeds = mx->symbol->split(data=>$x, num_outputs=>$num_embed_features, squeeze_axis=>1);
-    # continuous features
-    $x = mx->symbol->slice(data=>$dns_data, begin=>[0, $num_embed_features],
-                        end=>[undef, $num_embed_features + $num_cont_features]);
-    my @features = ($x);
-
-    enumerate(sub {
-        my ($i, $embed) = @_;
-        my $embed_weight = mx->symbol->Variable("embed_${i}_weight", stype=>'row_sparse');
-        push @features, mx->symbol->contrib->SparseEmbedding(data=>$embed, weight=>$embed_weight,
-                        input_dim=>$input_dims->[$i], output_dim=>$hidden_units->[0]);
-
-    }, $embeds);
-
-    my $hidden = mx->symbol->concat(@features, dim=>1);
-    $hidden = mx->symbol->FullyConnected(data=>$hidden, num_hidden=>$hidden_units->[1]);
-    $hidden = mx->symbol->Activation(data=>$hidden, act_type=>'relu');
-    $hidden = mx->symbol->FullyConnected(data=>$hidden, num_hidden=>$hidden_units->[2]);
-    $hidden = mx->symbol->Activation(data=>$hidden, act_type=>'relu');
-    my $deep_out = mx->symbol->FullyConnected(data=>$hidden, num_hidden=>2);
-
-    my $out = mx->symbol->SoftmaxOutput($linear_out + $deep_out, $label, name=>'model');
-    return $out;
-}
-
-sub preprocess_uci_adult
-{
-    my ($data_name) = @_;
-    # Some tricks of feature engineering are adapted
-    # from tensorflow's wide and deep tutorial.
-    my @csv_columns = (
-        "age", "workclass", "fnlwgt", "education", "education_num",
-        "marital_status", "occupation", "relationship", "race", "gender",
-        "capital_gain", "capital_loss", "hours_per_week", "native_country",
-        "income_bracket"
-    );
-
-    my %vocabulary_dict = (
-        "gender" => [
-            "Female", "Male"
-        ],
-        "education" => [
-            "Bachelors", "HS-grad", "11th", "Masters", "9th",
-            "Some-college", "Assoc-acdm", "Assoc-voc", "7th-8th",
-            "Doctorate", "Prof-school", "5th-6th", "10th", "1st-4th",
-            "Preschool", "12th"
-        ],
-        "marital_status" => [
-            "Married-civ-spouse", "Divorced", "Married-spouse-absent",
-            "Never-married", "Separated", "Married-AF-spouse", "Widowed"
-        ],
-        "relationship" => [
-            "Husband", "Not-in-family", "Wife", "Own-child", "Unmarried",
-            "Other-relative"
-        ],
-        "workclass" => [
-            "Self-emp-not-inc", "Private", "State-gov", "Federal-gov",
-            "Local-gov", "?", "Self-emp-inc", "Without-pay", "Never-worked"
-        ]
-    );
-    # wide columns
-    my @crossed_columns = (
-        ["education", "occupation"],
-        ["native_country", "occupation"],
-        ["age_buckets", "education", "occupation"],
-    );
-    my @age_boundaries = (18, 25, 30, 35, 40, 45, 50, 55, 60, 65);
-    # deep columns
-    my @indicator_columns = ('workclass', 'education', 'gender', 'relationship');
-
-    my @embedding_columns = ('native_country', 'occupation');
-
-    my @continuous_columns = ('age', 'education_num', 'capital_gain', 'capital_loss', 'hours_per_week');
-    # income_bracket column is the label
-    my %labels = ("<=50K" => 0, ">50K" => 1);
-
-    my $hash_bucket_size = 1000;
-
-    my $csr_ncols = @crossed_columns * $hash_bucket_size;
-    my $dns_ncols = @continuous_columns + @embedding_columns;
-    for my $col (@indicator_columns)
-    {
-        $dns_ncols += @{ $vocabulary_dict{$col} };
-    }
-
-    my @label_list;
-    my @csr_list;
-    my @dns_list;
-
-    open(F, $data_name) or die $!;
-    while(<F>)
-    {
-        chomp;
-        my %row;
-        @row{ @csv_columns } = split(/\s*,\s*/);
-        next if not defined $row{income_bracket};
-        push @label_list, $labels{$row{income_bracket}};
-        enumerate(sub {
-            my ($i, $cols) = @_;
-            if($cols->[0] eq "age_buckets")
-            {
-                my $age_bucket = digitize($row{age}, \@age_boundaries);
-                my $s = join('_', map { $row{$_} } @{ $cols }[1..@{$cols}-1]);
-                $s .= '_' . $age_bucket;
-                push @csr_list, [$i * $hash_bucket_size + hash($s) % $hash_bucket_size, 1];
-            }
-            else
-            {
-                my $s = join('_', map { $row{$_} } @{ $cols });
-                push @csr_list, [$i * $hash_bucket_size + hash($s) % $hash_bucket_size, 1];
-            }
-        }, \@crossed_columns);
-        my @dns_row = (0) x $dns_ncols;
-        my $dns_dim = 0;
-        for my $col (@embedding_columns)
-        {
-            $dns_row[$dns_dim] = hash($row{$col}) % $hash_bucket_size;
-            $dns_dim += 1;
-        }
-
-        for my $col (@indicator_columns)
-        {
-            $dns_row[$dns_dim + array_index($row{$col}, $vocabulary_dict{$col})] = 1;
-            $dns_dim += @{$vocabulary_dict{$col}};
-        }
-
-        for my $col (@continuous_columns)
-        {
-            $dns_row[$dns_dim] = $row{col};
-            $dns_dim += 1;
-        }
-        push @dns_list, \@dns_row;
-    }
-    my @data_list = map { $_->[1] } @csr_list;
-    my @indices_list = map { $_->[0] } @csr_list;
-    my @indptr_list = range(0, @indices_list + 1, scalar @crossed_columns);
-    # convert to ndarrays
-    my $csr = mx->nd->sparse->csr_matrix([\@data_list, \@indices_list, \@indptr_list],
-                                  shape=>[scalar @label_list, $hash_bucket_size * @crossed_columns]);
-    my $dns = pdl(\@dns_list);
-    my $label = pdl(\@label_list);
-    return ($csr, $dns, $label);
-}
-
-# Related to feature engineering, please see preprocess in data.py
-my %ADULT = (
-    train => './data/adult.data',
-    test  => './data/adult.test',
-    num_linear_features => 3000,
-    num_embed_features => 2,
-    num_cont_features => 38,
-    embed_input_dims => [1000, 1000],
-    hidden_units => [8, 50, 100],
-);
-
-
-my $ctx = $cuda ? mx->gpu : mx->cpu;
-
-# dataset
-my ($train_csr, $train_dns, $train_label) = preprocess_uci_adult($ADULT{train});
-my ($val_csr, $val_dns, $val_label) = preprocess_uci_adult($ADULT{test});
-my $model = wide_deep_model(
-    $ADULT{num_linear_features}, $ADULT{num_embed_features},
-    $ADULT{num_cont_features}, $ADULT{embed_input_dims},
-    $ADULT{hidden_units}
-);
-
-# data iterator
-my $train_data = mx->io->NDArrayIter(
-    data  => Hash::Ordered->new(csr_data => $train_csr, dns_data => $train_dns),
-    label => Hash::Ordered->new(softmax_label => $train_label), 
-    batch_size => $batch_size,
-    shuffle => 1,
-    last_batch_handle => 'discard'
-);
-my $eval_data = mx->io->NDArrayIter(
-    data  => Hash::Ordered->new(csr_data => $val_csr, val_data => $val_dns),
-    label => Hash::Ordered->new(softmax_label => $val_label), 
-    batch_size => $batch_size,
-    shuffle => 0,
-    last_batch_handle => 'discard'
-);
-
-# module
-my $mod = mx->mod->Module(
-    symbol => $model, context => $ctx, data_names=>['csr_data', 'dns_data'],
-    label_names => ['softmax_label']
-);
-$mod->bind(data_shapes => $train_data->provide_data, label_shapes => $train_data->provide_label);
-$mod->init_params();
-my $optim = mx->optimizer->create($optimizer, learning_rate=>$lr, rescale_grad=>1/$batch_size);
-$mod->init_optimizer(optimizer=>$optim);
-# use accuracy as the metric
-my $metric = mx->metric->create('acc');
-# get the sparse weight parameter
-my $speedometer = mx->callback->Speedometer($batch_size, $log_interval);
-
-print "Training started ...\n";
-
-for my $epoch (0..$num_epoch-1)
-{
-    my $nbatch = 0;
-    $metric->reset;
-    while(my $batch = <$train_data>)
-    {
-        $nbatch++;
-        $mod->forward_backward($batch);
-        # update all parameters (including the weight parameter)
-        $mod->update;
-        # update training metric
-        $mod->update_metric($metric, $batch->label);
-        my $speedometer_param = AI::MXNet::BatchEndParam->new(
-            epoch=>$epoch, nbatch=>$nbatch,
-            eval_metric=>$metric
-        );
-        $speedometer->($speedometer_param);
-    }
-    # evaluate metric on validation dataset
-    my $score = $mod->score($eval_data, 'acc');
-    printf("epoch %d, validation accuracy = %.4f\n", $epoch, $score->{accuracy});
-
-    $mod->save_checkpoint("checkpoint", $epoch, 1);
-    # reset the iterator for next pass of data
-    $train_data->reset;
-}
-
-print "Training completed.\n";
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet.pm b/perl-package/AI-MXNet/lib/AI/MXNet.pm
index 9c8f68277efe..b453cc54c6ac 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet.pm
@@ -77,136 +77,6 @@ AI::MXNet - Perl interface to MXNet machine learning library
 
 =head1 SYNOPSIS
 
-    ## Convolutional NN for recognizing hand-written digits in MNIST dataset
-    ## It's considered "Hello, World" for Neural Networks
-    ## For more info about the MNIST problem please refer to L<http://neuralnetworksanddeeplearning.com/chap1.html>
-
-    use strict;
-    use warnings;
-    use AI::MXNet qw(mx);
-    use AI::MXNet::TestUtils qw(GetMNIST_ubyte);
-    use Test::More tests => 1;
-
-    # symbol net
-    my $batch_size = 100;
-
-    ### model
-    my $data = mx->symbol->Variable('data');
-    my $conv1= mx->symbol->Convolution(data => $data, name => 'conv1', num_filter => 32, kernel => [3,3], stride => [2,2]);
-    my $bn1  = mx->symbol->BatchNorm(data => $conv1, name => "bn1");
-    my $act1 = mx->symbol->Activation(data => $bn1, name => 'relu1', act_type => "relu");
-    my $mp1  = mx->symbol->Pooling(data => $act1, name => 'mp1', kernel => [2,2], stride =>[2,2], pool_type=>'max');
-
-    my $conv2= mx->symbol->Convolution(data => $mp1, name => 'conv2', num_filter => 32, kernel=>[3,3], stride=>[2,2]);
-    my $bn2  = mx->symbol->BatchNorm(data => $conv2, name=>"bn2");
-    my $act2 = mx->symbol->Activation(data => $bn2, name=>'relu2', act_type=>"relu");
-    my $mp2  = mx->symbol->Pooling(data => $act2, name => 'mp2', kernel=>[2,2], stride=>[2,2], pool_type=>'max');
-
-
-    my $fl   = mx->symbol->Flatten(data => $mp2, name=>"flatten");
-    my $fc1  = mx->symbol->FullyConnected(data => $fl,  name=>"fc1", num_hidden=>30);
-    my $act3 = mx->symbol->Activation(data => $fc1, name=>'relu3', act_type=>"relu");
-    my $fc2  = mx->symbol->FullyConnected(data => $act3, name=>'fc2', num_hidden=>10);
-    my $softmax = mx->symbol->SoftmaxOutput(data => $fc2, name => 'softmax');
-
-    # check data
-    GetMNIST_ubyte();
-
-    my $train_dataiter = mx->io->MNISTIter({
-        image=>"data/train-images-idx3-ubyte",
-        label=>"data/train-labels-idx1-ubyte",
-        data_shape=>[1, 28, 28],
-        batch_size=>$batch_size, shuffle=>1, flat=>0, silent=>0, seed=>10});
-    my $val_dataiter = mx->io->MNISTIter({
-        image=>"data/t10k-images-idx3-ubyte",
-        label=>"data/t10k-labels-idx1-ubyte",
-        data_shape=>[1, 28, 28],
-        batch_size=>$batch_size, shuffle=>1, flat=>0, silent=>0});
-
-    my $n_epoch = 1;
-    my $mod = mx->mod->new(symbol => $softmax);
-    $mod->fit(
-        $train_dataiter,
-        eval_data => $val_dataiter,
-        optimizer_params=>{learning_rate=>0.01, momentum=> 0.9},
-        num_epoch=>$n_epoch
-    );
-    my $res = $mod->score($val_dataiter, mx->metric->create('acc'));
-    ok($res->{accuracy} > 0.8);
-
-    ## Gluon MNIST example
-
-    my $net = nn->Sequential();
-    $net->name_scope(sub {
-        $net->add(nn->Dense(128, activation=>'relu'));
-        $net->add(nn->Dense(64, activation=>'relu'));
-        $net->add(nn->Dense(10));
-    });
-    $net->hybridize;
-
-    # data
-    sub transformer
-    {
-        my ($data, $label) = @_;
-        $data = $data->reshape([-1])->astype('float32')/255;
-        return ($data, $label);
-    }
-    my $train_data = gluon->data->DataLoader(
-        gluon->data->vision->MNIST('./data', train=>1, transform => \&transformer),
-        batch_size=>$batch_size, shuffle=>1, last_batch=>'discard'
-    );
-
-    ## training
-    sub train
-    {
-        my ($epochs, $ctx) = @_;
-        # Collect all parameters from net and its children, then initialize them.
-        $net->initialize(mx->init->Xavier(magnitude=>2.24), ctx=>$ctx);
-        # Trainer is for updating parameters with gradient.
-        my $trainer = gluon->Trainer($net->collect_params(), 'sgd', { learning_rate => $lr, momentum => $momentum });
-        my $metric = mx->metric->Accuracy();
-        my $loss = gluon->loss->SoftmaxCrossEntropyLoss();
-
-        for my $epoch (0..$epochs-1)
-        {
-            # reset data iterator and metric at begining of epoch.
-            $metric->reset();
-            enumerate(sub {
-                my ($i, $d) = @_;
-                my ($data, $label) = @$d;
-                $data = $data->as_in_context($ctx);
-                $label = $label->as_in_context($ctx);
-                # Start recording computation graph with record() section.
-                # Recorded graphs can then be differentiated with backward.
-                my $output;
-                autograd->record(sub {
-                    $output = $net->($data);
-                    my $L = $loss->($output, $label);
-                    $L->backward;
-                });
-                # take a gradient step with batch_size equal to data.shape[0]
-                $trainer->step($data->shape->[0]);
-                # update metric at last.
-                $metric->update([$label], [$output]);
-
-                if($i % $log_interval == 0 and $i > 0)
-                {
-                    my ($name, $acc) = $metric->get();
-                    print "[Epoch $epoch Batch $i] Training: $name=$acc\n";
-                }
-            }, \@{ $train_data });
-
-            my ($name, $acc) = $metric->get();
-            print "[Epoch $epoch] Training: $name=$acc\n";
-
-            my ($val_name, $val_acc) = test($ctx);
-            print "[Epoch $epoch] Validation: $val_name=$val_acc\n"
-        }
-        $net->save_parameters('mnist.params');
-    }
-
-    train($epochs, $cuda ? mx->gpu(0) : mx->cpu);
-
 =head1 DESCRIPTION
 
     Perl interface to MXNet machine learning library.
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Module/Bucketing.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Module/Bucketing.pm
index 423b0aec9964..a8c482e3f178 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Module/Bucketing.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Module/Bucketing.pm
@@ -28,83 +28,6 @@ AI::MXNet::Module::Bucketing
 
 =head1 SYNOPSIS
 
-    my $buckets = [10, 20, 30, 40, 50, 60];
-    my $start_label   = 1;
-    my $invalid_label = 0;
-
-    my ($train_sentences, $vocabulary) = tokenize_text(
-        './data/sherlockholmes.train.txt', start_label => $start_label,
-        invalid_label => $invalid_label
-    );
-    my ($validation_sentences) = tokenize_text(
-        './data/sherlockholmes.test.txt', vocab => $vocabulary,
-        start_label => $start_label, invalid_label => $invalid_label
-    );
-    my $data_train  = mx->rnn->BucketSentenceIter(
-        $train_sentences, $batch_size, buckets => $buckets,
-        invalid_label => $invalid_label
-    );
-    my $data_val    = mx->rnn->BucketSentenceIter(
-        $validation_sentences, $batch_size, buckets => $buckets,
-        invalid_label => $invalid_label
-    );
-
-    my $stack = mx->rnn->SequentialRNNCell();
-    for my $i (0..$num_layers-1)
-    {
-        $stack->add(mx->rnn->LSTMCell(num_hidden => $num_hidden, prefix => "lstm_l${i}_"));
-    }
-
-    my $sym_gen = sub {
-        my $seq_len = shift;
-        my $data  = mx->sym->Variable('data');
-        my $label = mx->sym->Variable('softmax_label');
-        my $embed = mx->sym->Embedding(
-            data => $data, input_dim => scalar(keys %$vocabulary),
-            output_dim => $num_embed, name => 'embed'
-        );
-        $stack->reset;
-        my ($outputs, $states) = $stack->unroll($seq_len, inputs => $embed, merge_outputs => 1);
-        my $pred = mx->sym->Reshape($outputs, shape => [-1, $num_hidden]);
-        $pred    = mx->sym->FullyConnected(data => $pred, num_hidden => scalar(keys %$vocabulary), name => 'pred');
-        $label   = mx->sym->Reshape($label, shape => [-1]);
-        $pred    = mx->sym->SoftmaxOutput(data => $pred, label => $label, name => 'softmax');
-        return ($pred, ['data'], ['softmax_label']);
-    };
-
-    my $contexts;
-    if(defined $gpus)
-    {
-        $contexts = [map { mx->gpu($_) } split(/,/, $gpus)];
-    }
-    else
-    {
-        $contexts = mx->cpu(0);
-    }
-
-    my $model = mx->mod->BucketingModule(
-        sym_gen             => $sym_gen,
-        default_bucket_key  => $data_train->default_bucket_key,
-        context             => $contexts
-    );
-
-    $model->fit(
-        $data_train,
-        eval_data           => $data_val,
-        eval_metric         => mx->metric->Perplexity($invalid_label),
-        kvstore             => $kv_store,
-        optimizer           => $optimizer,
-        optimizer_params    => {
-                                    learning_rate => $lr,
-                                    momentum      => $mom,
-                                    wd            => $wd,
-                            },
-        initializer         => mx->init->Xavier(factor_type => "in", magnitude => 2.34),
-        num_epoch           => $num_epoch,
-        batch_end_callback  => mx->callback->Speedometer($batch_size, $disp_batches),
-        ($chkp_epoch ? (epoch_end_callback  => mx->rnn->do_rnn_checkpoint($stack, $chkp_prefix, $chkp_epoch)) : ())
-    );
-
 =head1 DESCRIPTION
 
     Implements the AI::MXNet::Module::Base API, and allows multiple
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/TestUtils.pm b/perl-package/AI-MXNet/lib/AI/MXNet/TestUtils.pm
index 2b7249fea8ee..45f13dbf4e53 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/TestUtils.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/TestUtils.pm
@@ -28,7 +28,7 @@ use AI::MXNet::Base;
 use Exporter;
 use base qw(Exporter);
 our @EXPORT_OK = qw(same reldiff almost_equal GetMNIST_ubyte
-                    GetCifar10 pdl_maximum pdl_minimum mlp2 conv dies_ok
+                    GetCifar10 pdl_maximum pdl_minimum mlp2 dies_ok
                     check_consistency zip assert enumerate same_array dies_like allclose rand_shape_2d
                     rand_shape_3d rand_sparse_ndarray random_arrays rand_ndarray randint pdl);
 use constant default_numerical_threshold => 1e-6;
@@ -183,25 +183,6 @@ func mlp2()
     return $out;
 }
 
-func conv()
-{
-    my $data    = AI::MXNet::Symbol->Variable('data');
-    my $conv1   = AI::MXNet::Symbol->Convolution(data => $data, name=>'conv1', num_filter=>32, kernel=>[3,3], stride=>[2,2]);
-    my $bn1     = AI::MXNet::Symbol->BatchNorm(data => $conv1, name=>"bn1");
-    my $act1    = AI::MXNet::Symbol->Activation(data => $bn1, name=>'relu1', act_type=>"relu");
-    my $mp1     = AI::MXNet::Symbol->Pooling(data => $act1, name => 'mp1', kernel=>[2,2], stride=>[2,2], pool_type=>'max');
-
-    my $conv2   = AI::MXNet::Symbol->Convolution(data => $mp1, name=>'conv2', num_filter=>32, kernel=>[3,3], stride=>[2,2]);
-    my $bn2     = AI::MXNet::Symbol->BatchNorm(data => $conv2, name=>"bn2");
-    my $act2    = AI::MXNet::Symbol->Activation(data => $bn2, name=>'relu2', act_type=>"relu");
-    my $mp2     = AI::MXNet::Symbol->Pooling(data => $act2, name => 'mp2', kernel=>[2,2], stride=>[2,2], pool_type=>'max');
-
-    my $fl      = AI::MXNet::Symbol->Flatten(data => $mp2, name=>"flatten");
-    my $fc2     = AI::MXNet::Symbol->FullyConnected(data => $fl, name=>'fc2', num_hidden=>10);
-    my $softmax = AI::MXNet::Symbol->SoftmaxOutput(data => $fc2, name => 'sm');
-    return $softmax;
-}
-
 =head2 check_consistency
 
     Check symbol gives the same output for different running context
diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm
index 90ec1da4e289..3abfda15eba0 100644
--- a/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm
+++ b/perl-package/AI-MXNet/lib/AI/MXNet/Visualization.pm
@@ -52,10 +52,9 @@ use JSON::PP;
     my $fc1  = mx->symbol->FullyConnected(data => $fl,  name=>"fc1", num_hidden=>30);
     my $act3 = mx->symbol->Activation(data => $fc1, name=>'relu3', act_type=>"relu");
     my $fc2  = mx->symbol->FullyConnected(data => $act3, name=>'fc2', num_hidden=>10);
-    my $softmax = mx->symbol->SoftmaxOutput(data => $fc2, name => 'softmax');
 
     ## creates the image file working directory
-    mx->viz->plot_network($softmax, save_format => 'png')->render("network.png");
+    mx->viz->plot_network($fc2, save_format => 'png')->render("network.png");
 
 =head1 DESCRIPTION
 
diff --git a/perl-package/AI-MXNet/t/test_conv.t b/perl-package/AI-MXNet/t/test_conv.t
deleted file mode 100644
index 19c302bc87e0..000000000000
--- a/perl-package/AI-MXNet/t/test_conv.t
+++ /dev/null
@@ -1,72 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-use AI::MXNet qw(mx);
-use AI::MXNet::TestUtils qw(GetMNIST_ubyte);
-use Test::More tests => 1;
-
-## speed up the tests when gpu present
-my $gpu_present = mx->context->num_gpus;
-
-# symbol net
-my $batch_size = 100;
-
-### model
-my $data = mx->symbol->Variable('data');
-my $conv1= mx->symbol->Convolution(data => $data, name => 'conv1', num_filter => 32, kernel => [3,3], stride => [2,2]);
-my $bn1  = mx->symbol->BatchNorm(data => $conv1, name => "bn1");
-my $act1 = mx->symbol->Activation(data => $bn1, name => 'relu1', act_type => "relu");
-my $mp1  = mx->symbol->Pooling(data => $act1, name => 'mp1', kernel => [2,2], stride =>[2,2], pool_type=>'max');
-
-my $conv2= mx->symbol->Convolution(data => $mp1, name => 'conv2', num_filter => 32, kernel=>[3,3], stride=>[2,2]);
-my $bn2  = mx->symbol->BatchNorm(data => $conv2, name=>"bn2");
-my $act2 = mx->symbol->Activation(data => $bn2, name=>'relu2', act_type=>"relu");
-my $mp2  = mx->symbol->Pooling(data => $act2, name => 'mp2', kernel=>[2,2], stride=>[2,2], pool_type=>'max');
-
-
-my $fl   = mx->symbol->Flatten(data => $mp2, name=>"flatten");
-my $fc1  = mx->symbol->FullyConnected(data => $fl,  name=>"fc1", num_hidden=>30);
-my $act3 = mx->symbol->Activation(data => $fc1, name=>'relu3', act_type=>"relu");
-my $fc2  = mx->symbol->FullyConnected(data => $act3, name=>'fc2', num_hidden=>10);
-my $softmax = mx->symbol->SoftmaxOutput(data => $fc2, name => 'softmax');
-
-# check data
-GetMNIST_ubyte();
-
-my $train_dataiter = mx->io->MNISTIter({
-        image=>"data/train-images-idx3-ubyte",
-        label=>"data/train-labels-idx1-ubyte",
-        data_shape=>[1, 28, 28],
-        batch_size=>$batch_size, shuffle=>1, flat=>0, silent=>0, seed=>10});
-my $val_dataiter = mx->io->MNISTIter({
-        image=>"data/t10k-images-idx3-ubyte",
-        label=>"data/t10k-labels-idx1-ubyte",
-        data_shape=>[1, 28, 28],
-        batch_size=>$batch_size, shuffle=>1, flat=>0, silent=>0});
-
-my $n_epoch = 1;
-my $mod = mx->mod->new(symbol => $softmax, ($gpu_present ? (context => mx->gpu(0)) : ()));
-$mod->fit(
-    $train_dataiter,
-    eval_data => $val_dataiter,
-    optimizer_params=>{learning_rate=>0.01, momentum=> 0.9},
-    num_epoch=>$n_epoch
-);
-my $res = $mod->score($val_dataiter, mx->metric->create('acc'));
-ok($res->{accuracy} > 0.8);
diff --git a/perl-package/AI-MXNet/t/test_module.t b/perl-package/AI-MXNet/t/test_module.t
index 55e098683399..2b5e72463275 100644
--- a/perl-package/AI-MXNet/t/test_module.t
+++ b/perl-package/AI-MXNet/t/test_module.t
@@ -17,7 +17,7 @@
 
 use strict;
 use warnings;
-use Test::More tests => 428;
+use Test::More tests => 19;
 use AI::MXNet qw(mx);
 use AI::MXNet::Base;
 use AI::MXNet::TestUtils qw(almost_equal enumerate same_array dies_like rand_ndarray);
@@ -173,124 +173,6 @@ sub test_module_states
     }
 }
 
-sub test_module_switch_bucket
-{
-    my $vocab_dim  = 5000;
-    my $num_hidden = 100;
-    my $num_embedding = 100;
-    my $num_layer = 2;
-    my $default_key = 10;
-    my $test_key = 5;
-    my $batch_size = 32;
-    my $contexts = [mx->cpu(0)];
-    my $initializer = mx->init->Xavier(factor_type=>"in", magnitude=>2.34);
-
-    #generate symbols for an LSTM network
-    my $gen_sym = sub {
-        my $seq_len = shift;
-        my $data  = mx->sym->Variable('data');
-        my $label = mx->sym->Variable('softmax_label');
-        my $embed = mx->sym->Embedding(data=>$data, input_dim=>$vocab_dim,
-                                 output_dim=>$num_embedding, name=>'embed');
-        my $stack = mx->rnn->SequentialRNNCell();
-        for my $i (0..$num_layer-1)
-        {
-            $stack->add(mx->rnn->LSTMCell(num_hidden=>$num_hidden, prefix=>"lstm_l${i}_"));
-        }
-        my ($outputs, $states) = $stack->unroll($seq_len, inputs=>$embed, merge_outputs=>1);
-
-        my $pred = mx->sym->Reshape($outputs, shape=>[-1, $num_hidden]);
-        $pred = mx->sym->FullyConnected(data=>$pred, num_hidden=>$vocab_dim, name=>'pred');
-
-        $label = mx->sym->Reshape($label, shape=>[-1]);
-        $pred = mx->sym->SoftmaxOutput(data=>$pred, label=>$label, name=>'softmax');
-
-        return ($pred, ['data'], ['softmax_label']);
-    };
-    my $create_bucketing_module = sub { my $key = shift;
-        my $model = mx->mod->BucketingModule(
-            sym_gen             => $gen_sym,
-            default_bucket_key  => $key,
-            context             => $contexts
-        );
-        $model->bind(data_shapes=>[['data', [$batch_size, $key]]],
-                    label_shapes=>[['softmax_label', [$batch_size, $key]]]
-        );
-        $model->init_params(initializer=>$initializer);
-        return $model;
-    };
-    #initialize the bucketing module with the default bucket key
-    my $bucketing_model = $create_bucketing_module->($default_key);
-    #switch to test_key
-    $bucketing_model->switch_bucket(
-        bucket_key   => $test_key,
-        data_shapes  => [['data', [$batch_size, $test_key]]],
-        label_shapes => [['softmax_label', [$batch_size, $test_key]]]
-    );
-
-    delete $bucketing_model->_buckets->{$test_key};
-
-    $bucketing_model->switch_bucket(
-        bucket_key   => $test_key,
-        data_shapes  => [['data', [$batch_size, $test_key]]],
-        label_shapes => [['softmax_label', [$batch_size, $test_key]]]
-    );
-}
-
-sub test_monitor
-{
-    mx->random->seed(11);
-    my $data = mx->nd->array([[0.05, .10]]);
-    my $label = mx->nd->array([[.01, 0.99]]);
-    my $train_data = mx->io->NDArrayIter($data, label => $label, batch_size=>1);
-
-    # symbols
-    my $x = mx->symbol->Variable('data');
-    $x = mx->symbol->FullyConnected(name=>'fc_0', data=>$x, num_hidden=>2);
-    $x = mx->symbol->Activation(name=>"act_0", data=>$x, act_type=>'sigmoid');
-    $x = mx->symbol->FullyConnected(name=>'fc_1', data=>$x, num_hidden=>2);
-    $x = mx->symbol->Activation(name=>"act_1", data=>$x, act_type=>'sigmoid');
-    $x = mx->symbol->LinearRegressionOutput(data=>$x, name=>'softmax', grad_scale=>2);
-
-    # create monitor
-    my $mean_abs = sub { my ($x) = @_;
-        return $x->abs->sum/$x->size;
-    };
-    my $mon = mx->mon->Monitor(1, stat_func=>$mean_abs, pattern=>'.*', sort=>1);
-
-    # create module
-    my $mod = mx->mod->Module($x, context=>[mx->cpu()]);
-    $mod->bind(data_shapes=>$train_data->provide_data, label_shapes=>$train_data->provide_label,
-                    for_training=>1);
-    $mod->install_monitor($mon);
-    my $arg_params = {fc_0_weight => mx->nd->array([[.15, .20], [.25, .30]]),
-                  fc_0_bias  => mx->nd->array([.35, .35]),
-                  fc_1_weight => mx->nd->array([[.40, .45], [.50, .55]]),
-                  fc_1_bias  => mx->nd->array([.60, .60])};
-    $mod->init_params(arg_params=>$arg_params);
-
-    my $data_batch = <$train_data>;
-    $mon->tic();
-    $mod->forward_backward($data_batch);
-    my $res = $mon->toc();
-    my $keys = ['act_0', 'act_1', 'data', 'fc_0', 'fc_1', 'softmax'];
-    my $mon_result_counts = [0, 0, 0, 0, 0, 0];
-    ok(@$res == 21);
-    for my $r (@$res)
-    {
-        my ($n, $k, $v) = @$r;
-        enumerate(sub {
-            my ($idx, $key) = @_;
-            if($k =~ /^$key/)
-            {
-                $mon_result_counts->[$idx] += 1;
-                return;
-            }
-        }, $keys);
-    }
-    is_deeply($mon_result_counts, [2, 2, 1, 6, 6, 4]);
-}
-
 sub test_module_dtype
 {
     my $dtype = 'float16';
@@ -351,445 +233,6 @@ sub test_module_input_grads
     ok(($c_grad == 3)->all);
 }
 
-sub test_executor_group
-{
-    my $get_rnn_sym = sub { my ($num_layers, $num_words, $num_hidden, $num_embed, $seq_len, $sparse_embedding) = @_;
-        my $stack = mx->rnn->SequentialRNNCell();
-        for my $i (0..$num_layers-1)
-        {
-            $stack->add(mx->rnn->LSTMCell(num_hidden=>$num_hidden, prefix=>"lstm_l${i}_"));
-        }
-        my $data = mx->sym->Variable('data');
-        my $label = mx->sym->Variable('softmax_label');
-        my $embed;
-        if($sparse_embedding)
-        {
-            my $embed_weight = mx->sym->Variable('embed_weight', stype=>'row_sparse');
-            $embed = mx->sym->contrib->SparseEmbedding(data=>$data, input_dim=>$num_words,
-                                                   weight=>$embed_weight, output_dim=>$num_embed,
-                                                   name=>'embed');
-
-        }
-        else
-        {
-            $embed = mx->sym->Embedding(data=>$data, input_dim=>$num_words,
-                                 output_dim=>$num_embed, name=>'embed');
-        }
-
-        $stack->reset();
-        my ($outputs, $states) = $stack->unroll($seq_len, inputs=>$embed, merge_outputs=>1);
-
-        my $pred = mx->sym->Reshape($outputs, shape=>[-1, $num_hidden]);
-        $pred = mx->sym->FullyConnected(data=>$pred, num_hidden=>$num_words, name=>'pred');
-
-        $label = mx->sym->Reshape($label, shape=>[-1]);
-        $pred = mx->sym->SoftmaxOutput(data=>$pred, label=>$label, name=>'softmax');
-        return $pred;
-    };
-
-    my $test_shared_exec_group = sub { my ($exec_grp_shared, $exec_grp_created, $shared_arg_names, $extra_args, $check_grads) = @_;
-        # Test shared data arrays
-        for my $i (0..@{ $exec_grp_shared->execs }-1)
-        {
-            # test same shared_data_arrays for two exec groups
-            my $shared_data_array1 = $exec_grp_shared->shared_data_arrays->[$i];
-            my $shared_data_array2 = $exec_grp_created->shared_data_arrays->[$i];
-            if(defined $extra_args)
-            {
-                ok(keys(%$shared_data_array1) == @$extra_args);
-            }
-            ok(keys(%$shared_data_array1) == keys(%$shared_data_array2));
-            while(my ($k, $v) = each %{ $shared_data_array1 })
-            {
-                if(defined $extra_args)
-                {
-                    ok(grep { $_ eq $k } @$extra_args);
-                }
-                ok(exists $shared_data_array2->{$k});
-                ok(same_array($v, $shared_data_array2->{$k}));
-            }
-            # Test shared argument arrays and gradient arrays
-            my $exec_shared  = $exec_grp_shared->execs->[$i];
-            my $exec_created = $exec_grp_created->execs->[$i];
-            if(defined $shared_arg_names)
-            {
-                # test shared arguments
-                for my $arg_name (@$shared_arg_names)
-                {
-                    ok(exists $exec_created->arg_dict->{$arg_name});
-                    ok(same_array($exec_shared->arg_dict->{$arg_name}, $exec_created->arg_dict->{$arg_name}));
-                }
-                # test shared argument gradients
-                for my $arg_name (@$shared_arg_names)
-                {
-                    if($check_grads)
-                    {
-                        ok(exists $exec_created->grad_dict->{$arg_name});
-                        ok(same_array($exec_shared->grad_dict->{$arg_name}, $exec_created->grad_dict->{$arg_name}));
-                    }
-                }
-            }
-            my $grad_req = $exec_grp_shared->grad_req;
-            while(my ($arg_name, $grad) = each %{ $grad_req })
-            {
-                ok($grad eq $exec_grp_created->grad_req->{$arg_name});
-            }
-        }
-    };
-
-    for my $sparse_embedding (0, 1)
-    {
-        my $contexts = [mx->cpu(0), mx->cpu(1)];
-        my $workload = [(1) x scalar(@$contexts)];
-        my $batch_size = 32;
-        my $max_bucket_size = 80;
-        my $num_words = 1000;
-        my $num_hidden = 100;
-        my $num_embed = 200;
-        my $data_shapes = [['data', [$batch_size, $max_bucket_size]]];
-        my $label_shapes = [['softmax_label', [$batch_size, $max_bucket_size]]];
-
-        # generate an rnn sym with #layers=5
-        my $sym = $get_rnn_sym->(3, $num_words, $num_hidden,
-                      $num_embed, $max_bucket_size, $sparse_embedding);
-        my $arg_names1 = $sym->list_arguments();
-        my $input_names = ['data', 'softmax_label'];
-        my $shared_arg_names = [grep { !/^(?:data|softmax_label)$/ } @$arg_names1];
-        my $exec_group1 = AI::MXNet::DataParallelExecutorGroup->new(
-            symbol=>$sym, contexts=>$contexts,
-            workload=>$workload, data_shapes=>$data_shapes,
-            label_shapes=>$label_shapes, param_names=>$shared_arg_names,
-            for_training=>1, inputs_need_grad=>0
-        );
-        # shared_data_arrays should only have input "data" and "softmax_label" arrays
-        for my $i (0..@{$contexts}-1)
-        {
-            ok(keys(%{$exec_group1->shared_data_arrays->[$i]}) == @$input_names);
-            for my $name (@$input_names)
-            {
-                ok(exists $exec_group1->shared_data_arrays->[$i]->{$name});
-            }
-        }
-        # generate an rnn sym with #layers=5
-        $sym = $get_rnn_sym->(5, $num_words, $num_hidden,
-                         $num_embed, $max_bucket_size, $sparse_embedding);
-        my $arg_names2 = $sym->list_arguments();
-        my $exec_group2 = AI::MXNet::DataParallelExecutorGroup->new(symbol=>$sym, contexts=>$contexts,
-                                            workload=>$workload, data_shapes=>$data_shapes,
-                                            label_shapes=>$label_shapes, param_names=>$shared_arg_names,
-                                            for_training=>1, inputs_need_grad=>0,
-                                            shared_group=>$exec_group1);
-        my %shared_arg_names = map { $_ => 1 } @$shared_arg_names;
-        my $extra_args = [grep { not exists $shared_arg_names{$_} } @$arg_names2];
-        $test_shared_exec_group->(
-            $exec_group1, $exec_group2,
-            $shared_arg_names, $extra_args, not $sparse_embedding
-        );
-    }
-}
-
-sub test_factorization_machine_module
-{
-    mx->random->seed(11);
-    my $check_factorization_machine_module = sub { my ($optimizer, $num_epochs) = @_;
-        my $fm = sub { my ($factor_size, $feature_dim, $init) = @_;
-            my $x = mx->symbol->Variable("data", stype=>'csr');
-            my $v = mx->symbol->Variable("v", shape=>[$feature_dim, $factor_size],
-                                   init=>$init, stype=>'row_sparse');
-
-            my $w1_weight = mx->symbol->var('w1_weight', shape=>[$feature_dim, 1],
-                                      init=>$init, stype=>'row_sparse');
-            my $w1_bias = mx->symbol->var('w1_bias', shape=>[1]);
-            my $w1 = mx->symbol->broadcast_add(mx->symbol->dot($x, $w1_weight), $w1_bias);
-
-            my $v_s = mx->symbol->_square_sum(data=>$v, axis=>1, keepdims=>1);
-            my $x_s = mx->symbol->square(data=>$x);
-            my $bd_sum = mx->sym->dot($x_s, $v_s);
-
-            my $w2 = mx->symbol->dot($x, $v);
-            my $w2_squared = 0.5 * mx->symbol->square(data=>$w2);
-
-            my $w_all = mx->symbol->Concat($w1, $w2_squared, dim=>1);
-            my $sum1 = mx->symbol->sum(data=>$w_all, axis=>1, keepdims=>1);
-            my $sum2 = 0.5 * mx->symbol->negative($bd_sum);
-            my $model = mx->sym->elemwise_add($sum1, $sum2);
-
-            my $y = mx->symbol->Variable("label");
-            $model = mx->symbol->LinearRegressionOutput(data=>$model, label=>$y);
-            return $model
-        };
-
-        # model
-        my $init = mx->initializer->Normal(sigma=>0.01);
-        my $factor_size = 4;
-        my $feature_dim = 10000;
-        my $model = $fm->($factor_size, $feature_dim, $init);
-
-        # data iter
-        my $num_batches = 5;
-        my $batch_size = 64;
-        my $num_samples = $batch_size * $num_batches;
-        # generate some random csr data
-        my $csr_nd = rand_ndarray([$num_samples, $feature_dim], 'csr', 0.1);
-        my $label = mx->nd->ones([$num_samples,1]);
-        # the alternative is to use LibSVMIter
-        my $train_iter = mx->io->NDArrayIter(data=>$csr_nd,
-                                       label=>Hash::Ordered->new(label => $label),
-                                       batch_size=>$batch_size,
-                                       last_batch_handle=>'discard');
-        # create module
-        my $mod = mx->mod->Module(symbol=>$model, data_names=>['data'], label_names=>['label']);
-        # allocate memory by given the input data and lable shapes
-        $mod->bind(data_shapes=>$train_iter->provide_data, label_shapes=>$train_iter->provide_label);
-        # initialize parameters by uniform random numbers
-        $mod->init_params(initializer=>$init);
-        my $expected_accuracy;
-        if($optimizer eq 'sgd')
-        {
-            # use Sparse SGD with learning rate 0.1 to train
-            my $sgd = mx->optimizer->SGD(momentum=>0.1, clip_gradient=>5.0, learning_rate=>0.01,
-                                   rescale_grad=>1.0/$batch_size);
-            $mod->init_optimizer(optimizer=>$sgd);
-            $num_epochs //= 10;
-            $expected_accuracy = 0.02;
-        }
-        elsif($optimizer eq 'adam')
-        {
-            # use Sparse Adam to train
-            my $adam = mx->optimizer->Adam(clip_gradient=>5.0, learning_rate=>0.0005,
-                                     rescale_grad=>1.0/$batch_size);
-            $mod->init_optimizer(optimizer=>$adam);
-            $num_epochs //= 10;
-            $expected_accuracy = 0.05;
-        }
-        elsif($optimizer eq 'adagrad')
-        {
-            # use Sparse AdaGrad with learning rate 0.1 to train
-            my $adagrad = mx->optimizer->AdaGrad(clip_gradient=>5.0, learning_rate=>0.01,
-                                           rescale_grad=>1.0/$batch_size);
-            $mod->init_optimizer(optimizer=>$adagrad);
-            $num_epochs //= 20;
-            $expected_accuracy = 0.09;
-        }
-        else
-        {
-            die "Unsupported optimizer type $optimizer specified";
-        }
-        # use accuracy as the metric
-        my $metric = mx->metric->create('MSE');
-        # train 'num_epochs' epoch
-        for my $epoch (1..$num_epochs)
-        {
-            $train_iter->reset();
-            $metric->reset();
-            while(my $batch = <$train_iter>)
-            {
-                $mod->forward($batch, is_train=>1);       # compute predictions
-                $mod->update_metric($metric, $batch->label);  # accumulate prediction accuracy
-                $mod->backward();                          # compute gradients
-                $mod->update();                            # update parameters
-            }
-        }
-        if($num_epochs > 1)
-        {
-            ok(($metric->get)[1] < $expected_accuracy);
-        }
-    };
-
-    $check_factorization_machine_module->('sgd');
-    $check_factorization_machine_module->('adam');
-    $check_factorization_machine_module->('adagrad');
-}
-
-
-sub test_module_initializer
-{
-    my $regression_model = sub { my ($m) = @_;
-         my $x = mx->symbol->var("data", stype=>'csr');
-         my $v = mx->symbol->var("v", shape=>[$m, 1], init=>mx->init->Uniform(scale=>.1),
-                                stype=>'row_sparse');
-         my $model = mx->symbol->dot(lhs=>$x, rhs=>$v);
-         my $y = mx->symbol->Variable("label");
-         $model = mx->symbol->LinearRegressionOutput(data=>$model, label=>$y, name=>"out");
-         return $model
-    };
-
-    my ($n, $m) = (128, 100);
-    my $model = $regression_model->($m);
-
-    my $data = mx->nd->zeros([$n, $m], stype=>'csr');
-    my $label = mx->nd->zeros([$n, 1]);
-    my $iterator = mx->io->NDArrayIter(data=>$data, label=>Hash::Ordered->new(label => $label),
-                                 batch_size=>$n, last_batch_handle=>'discard');
-
-    # create module
-    my $mod = mx->mod->Module(symbol=>$model, data_names=>['data'], label_names=>['label']);
-    $mod->bind(data_shapes=>$iterator->provide_data, label_shapes=>$iterator->provide_label);
-    $mod->init_params();
-    my $v = $mod->_arg_params->{v};
-    ok($v->stype eq 'row_sparse');
-    ok($v->aspdl->sum != 0);
-}
-
-sub test_module_set_params
-{
-    # data iter
-    mx->random->seed(11);
-    my $data = mx->nd->array([[0.05, .10]]);
-    my $label = mx->nd->array([[.01, 0.99]]);
-    my $train_data = mx->io->NDArrayIter(data => $data, label => $label, batch_size => 1);
-
-    # symbols
-    my $x = mx->symbol->Variable('data');
-    $x = mx->symbol->FullyConnected(name=>'fc_0', data=>$x, num_hidden=>2);
-    $x = mx->symbol->Activation(name=>"act_0", data=>$x, act_type=>'sigmoid');
-    $x = mx->symbol->FullyConnected(name=>'fc_1', data=>$x, num_hidden=>2);
-    $x = mx->symbol->Activation(name=>"act_1", data=>$x, act_type=>'sigmoid');
-    $x = mx->symbol->LinearRegressionOutput(data=>$x, name=>'softmax', grad_scale=>2);
-
-    # create module
-    my $mod = mx->mod->Module($x, context=>[mx->cpu()]);
-    $mod->bind(data_shapes => $train_data->provide_data, label_shapes=>$train_data->provide_label,
-             for_training=>1);
-
-    my $arg_params_correct = {fc_0_weight => mx->nd->array([[.15, .20], [.25, .30]]),
-                  fc_0_bias => mx->nd->array([.35, .35]),
-                  fc_1_weight =>  mx->nd->array([[.40, .45], [.50, .55]]),
-                  fc_1_bias  => mx->nd->array([.60, .60])};
-
-    my $arg_params_missing = {fc_0_weight => mx->nd->array([[.15, .20], [.25, .30]]),
-                  fc_0_bias  => mx->nd->array([.35, .35]),
-                  fc_1_weight => mx->nd->array([[.40, .45], [.50, .55]])};
-
-    my $arg_params_extra = {fc_0_weight => mx->nd->array([[.15, .20], [.25, .30]]),
-                  fc_0_bias  => mx->nd->array([.35, .35]),
-                  fc_1_weight=> mx->nd->array([[.40, .45], [.50, .55]]),
-                  fc_1_bias => mx->nd->array([.60, .60]),
-                  fc_2_weight => mx->nd->array([.60, .60])};
-
-    my $arg_params_missing_extra = {fc_3_weight => mx->nd->array([.60, .60])};
-
-    # test regular set_params
-    $mod->set_params($arg_params_correct, {}, force_init=>1);
-
-    # test allow missing
-    $mod->set_params($arg_params_missing, {}, allow_missing=>1, force_init=>1);
-    ok(dies_like(sub { $mod->set_params($arg_params_missing, {}, force_init=>1, allow_missing=>0); }, qr/fc_/));
-
-    # test allow extra
-    $mod->set_params($arg_params_extra, {}, force_init=>1, allow_missing=>1, allow_extra=>1);
-    ok(dies_like(sub { $mod->set_params($arg_params_extra, {}, force_init=>1, allow_missing=>1, allow_extra=>0); }, qr/fc_/));
-
-    # test allow missing + extra, this will throw a runtime error
-    ok(dies_like(sub { $mod->set_params($arg_params_missing_extra, {}, force_init=>1, allow_missing=>1, allow_extra=>0); }, qr/fc_/));
-}
-
-sub test_forward_reshape
-{
-    my $num_class = 10;
-    my $data1 = mx->sym->Variable('data1');
-    my $data2 = mx->sym->Variable('data2');
-    my $conv1 = mx->sym->Convolution(data=>$data1, kernel=>[2, 2], num_filter=>2, stride=>[2, 2]);
-    my $conv2 = mx->sym->Convolution(data=>$data2, kernel=>[3, 3], num_filter=>3, stride=>[1, 1]);
-    my $pooling1 = mx->sym->Pooling(data=>$conv1, kernel=>[2, 2], stride=>[1, 1], pool_type=>"avg");
-    my $pooling2 = mx->sym->Pooling(data=>$conv2, kernel=>[2, 2], stride=>[1, 1], pool_type=>"max");
-    my $flatten1 = mx->sym->flatten(data=>$pooling1);
-    my $flatten2 = mx->sym->flatten(data=>$pooling2);
-    my $sum = mx->sym->sum(data=>$flatten1, axis=>1) + mx->sym->sum(data=>$flatten2, axis=>1);
-    my $fc = mx->sym->FullyConnected(data=>$sum, num_hidden=>$num_class);
-    my $sym = mx->sym->SoftmaxOutput(data=>$fc, name=>'softmax');
-
-    my $dshape1 = [10, 3, 64, 64];
-    my $dshape2 = [10, 3, 32, 32];
-    my $lshape = [10];
-
-    my $mod = mx->mod->Module(symbol=>$sym, data_names=>['data1', 'data2'],
-                        label_names=>['softmax_label']);
-    $mod->bind(data_shapes=>[['data1', $dshape1], ['data2', $dshape2]],
-             label_shapes=>[['softmax_label', $lshape]]);
-    $mod->init_params();
-    $mod->init_optimizer(optimizer_params=>{learning_rate => 0.01});
-
-    # Train with original data shapes
-    my $data_batch = mx->io->DataBatch(data=>[mx->nd->random_uniform(0, 9, $dshape1),
-                                       mx->nd->random_uniform(5, 15, $dshape2)],
-                                 label=>[mx->nd->ones($lshape)]);
-    $mod->forward($data_batch);
-    is_deeply($mod->get_outputs->[0]->shape, [$lshape->[0], $num_class]);
-    $mod->backward();
-    $mod->update();
-
-    # Train with different batch size
-    $dshape1 = [3, 3, 64, 64];
-    $dshape2 = [3, 3, 32, 32];
-    $lshape = [3];
-    $data_batch = mx->io->DataBatch(data=>[mx->nd->random_uniform(0, 9, $dshape1),
-                                       mx->nd->random_uniform(5, 15, $dshape2)],
-                                 label=>[mx->nd->ones($lshape)]);
-    $mod->forward($data_batch);
-    is_deeply($mod->get_outputs->[0]->shape, [$lshape->[0], $num_class]);
-    $mod->backward();
-    $mod->update();
-
-    $dshape1 = [20, 3, 64, 64];
-    $dshape2 = [20, 3, 32, 32];
-    $lshape = [20];
-    $data_batch = mx->io->DataBatch(data=>[mx->nd->random_uniform(3, 5, $dshape1),
-                                       mx->nd->random_uniform(10, 25, $dshape2)],
-                                 label=>[mx->nd->ones($lshape)]);
-    $mod->forward($data_batch);
-    is_deeply($mod->get_outputs->[0]->shape, [$lshape->[0], $num_class]);
-    $mod->backward();
-    $mod->update();
-
-    #Train with both different batch size and data shapes
-    $dshape1 = [20, 3, 120, 120];
-    $dshape2 = [20, 3, 32, 64];
-    $lshape = [20];
-    $data_batch = mx->io->DataBatch(data=>[mx->nd->random_uniform(0, 9, $dshape1),
-                                       mx->nd->random_uniform(5, 15, $dshape2)],
-                                 label=>[mx->nd->ones($lshape)]);
-    $mod->forward($data_batch);
-    is_deeply($mod->get_outputs->[0]->shape, [$lshape->[0], $num_class]);
-    $mod->backward();
-    $mod->update();
-
-    $dshape1 = [5, 3, 28, 40];
-    $dshape2 = [5, 3, 24, 16];
-    $lshape = [5];
-    $data_batch = mx->io->DataBatch(data=>[mx->nd->random_uniform(0, 9, $dshape1),
-                                       mx->nd->random_uniform(15, 25, $dshape2)],
-                                 label=>[mx->nd->ones($lshape)]);
-    $mod->forward($data_batch);
-    is_deeply($mod->get_outputs->[0]->shape, [$lshape->[0], $num_class]);
-    $mod->backward();
-    $mod->update();
-
-    #Test score
-    my $dataset_shape1 = [30, 3, 30, 30];
-    my $dataset_shape2 = [30, 3, 20, 40];
-    my $labelset_shape = [30];
-
-    my $eval_dataiter = mx->io->NDArrayIter(data=>[mx->nd->random_uniform(0, 9, $dataset_shape1),
-                                            mx->nd->random_uniform(15, 25, $dataset_shape2)],
-                                      label=>[mx->nd->ones($labelset_shape)],
-                                      batch_size=>5);
-    ok(keys %{ $mod->score($eval_dataiter, 'acc') } == 1);
-
-    #Test prediction
-    $dshape1 = [1, 3, 30, 30];
-    $dshape2 = [1, 3, 20, 40];
-    $dataset_shape1 = [10, 3, 30, 30];
-    $dataset_shape2 = [10, 3, 20, 40];
-
-    my $pred_dataiter = mx->io->NDArrayIter(data=>[mx->nd->random_uniform(0, 9, $dataset_shape1),
-                                            mx->nd->random_uniform(15, 25, $dataset_shape2)]);
-    $mod->bind(data_shapes=>[['data1', $dshape1], ['data2', $dshape2]],
-             for_training=>0, force_rebind=>1);
-    is_deeply($mod->predict($pred_dataiter)->shape, [10, $num_class]);
-
-}
-
 sub test_forward_acceptable_input
 {
     my $data = mx->sym->Variable('data');
@@ -803,15 +246,7 @@ sub test_forward_acceptable_input
 
 test_module_input_grads();
 test_module_dtype();
-test_monitor();
-test_module_switch_bucket();
 test_module_layout();
 test_module_states();
-test_module_reshape();
 test_save_load();
-test_executor_group();
-test_module_set_params();
-test_forward_reshape();
-test_module_initializer();
-test_factorization_machine_module();
 test_forward_acceptable_input();
diff --git a/perl-package/AI-MXNet/t/test_multi_device_exec.t b/perl-package/AI-MXNet/t/test_multi_device_exec.t
deleted file mode 100644
index 1b37e6ee9981..000000000000
--- a/perl-package/AI-MXNet/t/test_multi_device_exec.t
+++ /dev/null
@@ -1,74 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-use strict;
-use warnings;
-use Test::More tests => 10;
-use AI::MXNet qw(mx);
-use AI::MXNet::Base;
-
-sub test_ctx_group
-{
-    my ($data, $fc1, $act1);
-    {
-        local($mx::AttrScope) = mx->AttrScope(ctx_group=>'stage1');
-        $data = mx->symbol->Variable('data');
-        $fc1  = mx->symbol->FullyConnected(data => $data, name=>'fc1', num_hidden=>128);
-        $act1 = mx->symbol->Activation(data => $fc1, name=>'relu1', act_type=>"relu");
-    }
-    my %set_stage1 = map { $_ => 1 } @{ $act1->list_arguments };
-
-    my ($fc2, $act2, $fc3, $mlp);
-    {
-        local($mx::AttrScope) = mx->AttrScope(ctx_group=>'stage2');
-        $fc2  = mx->symbol->FullyConnected(data => $act1, name => 'fc2', num_hidden => 64);
-        $act2 = mx->symbol->Activation(data => $fc2, name=>'relu2', act_type=>"relu");
-        $fc3  = mx->symbol->FullyConnected(data => $act2, name=>'fc3', num_hidden=>10);
-        $fc3  = mx->symbol->BatchNorm($fc3);
-        $mlp  = mx->symbol->SoftmaxOutput(data => $fc3, name => 'softmax');
-    }
-    my %set_stage2 = map { $_ => 1 } @{ $mlp->list_arguments };
-    for my $k (keys %set_stage1)
-    {
-        delete $set_stage2{$k};
-    }
-
-    my $group2ctx = {
-        stage1 => mx->cpu(1),
-        stage2 => mx->cpu(2)
-    };
-
-    my $texec = $mlp->simple_bind(
-        ctx       => mx->cpu(0),
-        group2ctx => $group2ctx,
-        shapes    => { data => [1,200] }
-    );
-
-    for(zip($texec->arg_arrays, $mlp->list_arguments())) {
-        my ($arr, $name) = @$_;
-        if(exists $set_stage1{ $name })
-        {
-            cmp_ok($arr->context, '==', $group2ctx->{stage1});
-        }
-        else
-        {
-            cmp_ok($arr->context, '==', $group2ctx->{stage2});
-        }
-    }
-}
-
-test_ctx_group();
diff --git a/perl-package/AI-MXNet/t/test_symbol.t b/perl-package/AI-MXNet/t/test_symbol.t
index 09bab2f96947..713480df053c 100644
--- a/perl-package/AI-MXNet/t/test_symbol.t
+++ b/perl-package/AI-MXNet/t/test_symbol.t
@@ -17,9 +17,9 @@
 
 use strict;
 use warnings;
-use Test::More tests => 103;
+use Test::More tests => 31;
 use AI::MXNet qw(mx);
-use AI::MXNet::TestUtils qw(mlp2 conv check_consistency zip assert enumerate almost_equal same);
+use AI::MXNet::TestUtils qw(mlp2 check_consistency zip assert enumerate almost_equal same);
 use Storable qw(freeze thaw);
 use PDL;
 
@@ -89,7 +89,7 @@ test_symbol_children();
 
 sub test_symbol_storable
 {
-    my $mlist = [mlp2(), conv()];
+    my $mlist = [mlp2()];
     my $data = freeze($mlist);
     my $mlist2 = thaw($data);
     zip(sub {
@@ -113,20 +113,6 @@ sub test_symbol_saveload
 
 test_symbol_saveload();
 
-sub test_symbol_infer_type
-{
-    my $data = mx->symbol->Variable('data');
-    my $f32data = mx->symbol->Cast(data=>$data, dtype=>'float32');
-    my $fc1 = mx->symbol->FullyConnected(data => $f32data, name=>'fc1', num_hidden=>128);
-    my $mlp = mx->symbol->SoftmaxOutput(data => $fc1, name => 'softmax');
-
-    my ($arg, $out, $aux) = $mlp->infer_type(data=>'float16');
-    is_deeply($arg, [qw/float16 float32 float32 float32/]);
-    is_deeply($out, ['float32']);
-    is_deeply($aux, []);
-}
-
-test_symbol_infer_type();
 
 sub test_symbol_infer_shape
 {
@@ -193,50 +179,6 @@ sub check_symbol_consistency
     check_consistency(sym => [$sym1, $sym2], ctx_list => [$ctx, $ctx]);
 }
 
-sub test_load_000800
-{
-    my ($data, $weight, $fc1, $act1);
-    {
-        local($mx::AttrScope) = mx->AttrScope(ctx_group=>'stage1');
-        $data = mx->symbol->Variable('data', lr_mult=>0.2);
-        $weight = mx->sym->Variable('fc1_weight', lr_mult=>1.2);
-        $fc1  = mx->symbol->FullyConnected(data => $data, weight=>$weight, name=>'fc1', num_hidden=>128, wd_mult=>0.3);
-        $act1 = mx->symbol->Activation(data => $fc1, name=>'relu1', act_type=>"relu");
-    }
-    my ($fc2, $act2, $fc3, $sym1);
-    {
-        local($mx::AttrScope) = mx->AttrScope(ctx_group=>'stage2');
-        $fc2  = mx->symbol->FullyConnected(data => $act1, name => 'fc2', num_hidden => 64, lr_mult=>0.01);
-        $act2 = mx->symbol->Activation(data => $fc2, name=>'relu2', act_type=>"relu");
-        $fc3  = mx->symbol->FullyConnected(data => $act2, name=>'fc3', num_hidden=>10);
-        $fc3  = mx->symbol->BatchNorm($fc3, name=>'batchnorm0');
-        $sym1 = mx->symbol->SoftmaxOutput(data => $fc3, name => 'softmax')
-    }
-    { local $/ = undef; my $json = <DATA>; open(F, ">save_000800.json"); print F $json; close(F); };
-    my $sym2 = mx->sym->load('save_000800.json');
-    unlink 'save_000800.json';
-
-    my %attr1 = %{ $sym1->attr_dict };
-    my %attr2 = %{ $sym2->attr_dict };
-    while(my ($k, $v1) = each %attr1)
-    {
-        ok(exists $attr2{ $k });
-        my $v2 = $attr2{$k};
-        while(my ($kk, $vv1) = each %{ $v1 })
-        {
-            if($kk =~ /^__/ and $kk =~ /__$/)
-            {
-                ok(exists $v2->{$kk} and $v2->{$kk} eq $vv1);
-            }
-        }
-    }
-
-    check_symbol_consistency($sym1, $sym2,
-        {ctx => mx->cpu(0), group2ctx =>{stage1 => mx->cpu(1), stage2 => mx->cpu(2) }, shapes => { data => [1,200] }}
-    );
-}
-
-test_load_000800();
 
 sub test_linalg_gemm2
 {
diff --git a/plugin/caffe/README.md b/plugin/caffe/README.md
index a497176a996f..7e60f2e83564 100644
--- a/plugin/caffe/README.md
+++ b/plugin/caffe/README.md
@@ -43,24 +43,16 @@ For example, the following code shows multi-layer perception network for classif
 ### Python
 ```Python
 data = mx.symbol.Variable('data')
+label = mx.symbol.Variable('softmax_label')
 fc1  = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }")
 act1 = mx.symbol.CaffeOp(data_0=fc1, prototxt="layer{type:\"TanH\"}")
 fc2  = mx.symbol.CaffeOp(data_0=act1, num_weight=2, name='fc2', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 64} }")
 act2 = mx.symbol.CaffeOp(data_0=fc2, prototxt="layer{type:\"TanH\"}")
 fc3 = mx.symbol.CaffeOp(data_0=act2, num_weight=2, name='fc3', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 10}}")
-mlp = mx.symbol.SoftmaxOutput(data=fc3, name='softmax')
+mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}")
 ```
 
 Let's break it down. First `data = mx.symbol.Variable('data')` defines a variable as placeholder for input.
 Then it's fed through Caffe operators with `fc1  = mx.symbol.CaffeOp(data_0=data, num_weight=2, name='fc1', prototxt="layer{type:\"InnerProduct\" inner_product_param{num_output: 128} }")`.
 
 The inputs to caffe op are named as data_i for i=0 ... num_data-1 as `num_data` is the number of inputs. You may skip the argument, as the example does, if its value is 1. While `num_weight` is number of `blobs_`(weights). Its default value is 0, as many ops maintain no weight. `prototxt` is the configuration string.
-
-We could also replace the last line by:
-
-```Python
-label = mx.symbol.Variable('softmax_label')
-mlp = mx.symbol.CaffeLoss(data=fc3, label=label, grad_scale=1, name='softmax', prototxt="layer{type:\"SoftmaxWithLoss\"}")
-```
-
-to use loss function in caffe.
diff --git a/python/mxnet/contrib/amp/lists/symbol_bf16.py b/python/mxnet/contrib/amp/lists/symbol_bf16.py
index 86edfe6fde8d..da01e6198be5 100644
--- a/python/mxnet/contrib/amp/lists/symbol_bf16.py
+++ b/python/mxnet/contrib/amp/lists/symbol_bf16.py
@@ -478,7 +478,6 @@
     'topk',
 
     # Neural network
-    'SoftmaxOutput',
     'softmax',
     'Softmax',
     'log_softmax',
@@ -487,13 +486,6 @@
     'GroupNorm',
     'L2Normalization',
     'SoftmaxActivation',
-    'LinearRegressionOutput',
-    'LogisticRegressionOutput',
-    'MAERegressionOutput',
-    '_sparse_LinearRegressionOutput',
-    '_sparse_LogisticRegressionOutput',
-    '_sparse_MAERegressionOutput',
-    'SVMOutput',
     'softmax_cross_entropy',
     'smooth_l1',
     'MakeLoss',
@@ -630,8 +622,4 @@
     ]
 
 LOSS_OUTPUT_FUNCTIONS = [
-    'SoftmaxOutput',
-    'LinearRegressionOutput',
-    'LogisticRegressionOutput',
-    'MAERegressionOutput',
     ]
diff --git a/python/mxnet/contrib/amp/lists/symbol_fp16.py b/python/mxnet/contrib/amp/lists/symbol_fp16.py
index d501a7d6c5b5..ae812fbac2d7 100644
--- a/python/mxnet/contrib/amp/lists/symbol_fp16.py
+++ b/python/mxnet/contrib/amp/lists/symbol_fp16.py
@@ -472,7 +472,6 @@
     'topk',
 
     # Neural network
-    'SoftmaxOutput',
     'softmax',
     'Softmax',
     'log_softmax',
@@ -482,13 +481,6 @@
     'L2Normalization',
     'LRN',
     'SoftmaxActivation',
-    'LinearRegressionOutput',
-    'LogisticRegressionOutput',
-    'MAERegressionOutput',
-    '_sparse_LinearRegressionOutput',
-    '_sparse_LogisticRegressionOutput',
-    '_sparse_MAERegressionOutput',
-    'SVMOutput',
     'softmax_cross_entropy',
     'smooth_l1',
     'MakeLoss',
@@ -629,8 +621,4 @@
     ]
 
 LOSS_OUTPUT_FUNCTIONS = [
-    'SoftmaxOutput',
-    'LinearRegressionOutput',
-    'LogisticRegressionOutput',
-    'MAERegressionOutput',
     ]
diff --git a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
index 247b39301b78..ebb58d972b8b 100644
--- a/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
+++ b/python/mxnet/contrib/onnx/mx2onnx/_op_translations.py
@@ -907,44 +907,6 @@ def convert_softmax(node, **kwargs):
     return [softmax_node]
 
 
-# There's also mx.sym.softmax(), which doesn't do cross-entropy loss,
-# just softmax for inference - hence the name convert_softmax_output.
-@mx_op.register("SoftmaxOutput")
-def convert_softmax_output(node, **kwargs):
-    """Map MXNet's SoftmaxOutput operator attributes to onnx's Softmax operator
-    and return the created node.
-    """
-    name = node["name"]
-
-    input1_idx = kwargs["index_lookup"][node["inputs"][0][0]]
-    input1 = kwargs["proc_nodes"][input1_idx]
-
-    softmax_node = onnx.helper.make_node(
-        "Softmax",
-        [input1.name],
-        [name],
-        axis=1,
-        name=name
-    )
-
-    return [softmax_node]
-
-@mx_op.register("LogisticRegressionOutput")
-def convert_logistic_regression_output(node, **kwargs):
-    """Map MXNet's SoftmaxOutput operator attributes to onnx's Softmax operator
-    and return the created node.
-    """
-    name = node["name"]
-    input1_idx = kwargs["index_lookup"][node["inputs"][0][0]]
-    input1 = kwargs["proc_nodes"][input1_idx]
-    sigmoid_node = onnx.helper.make_node(
-        "Sigmoid",
-        [input1.name],
-        [name],
-        name=name
-    )
-    return [sigmoid_node]
-
 @mx_op.register("BlockGrad")
 def convert_blockgrad(node, **kwargs):
     """ Skip operator  """
diff --git a/python/mxnet/executor.py b/python/mxnet/executor.py
index 25b5d85027bc..b42cfa8dd64a 100644
--- a/python/mxnet/executor.py
+++ b/python/mxnet/executor.py
@@ -161,56 +161,6 @@ def backward(self, out_grads=None, is_train=True):
             cases you want to call backward with is_train=False to get gradient
             during inference.
 
-
-        Examples
-        --------
-        >>> # Example for binding on loss function symbol, which gives the loss value of the model.
-        >>> # Equivalently it gives the head gradient for backward pass.
-        >>> # In this example the built-in SoftmaxOutput is used as loss function.
-        >>> # MakeLoss can be used to define customized loss function symbol.
-        >>> net = mx.sym.Variable('data')
-        >>> net = mx.sym.FullyConnected(net, name='fc', num_hidden=6)
-        >>> net = mx.sym.Activation(net, name='relu', act_type="relu")
-        >>> net = mx.sym.SoftmaxOutput(net, name='softmax')
-
-        >>> args =  {'data': mx.nd.ones((1, 4)), 'fc_weight': mx.nd.ones((6, 4)),
-        >>>          'fc_bias': mx.nd.array((1, 4, 4, 4, 5, 6)), 'softmax_label': mx.nd.ones((1))}
-        >>> args_grad = {'fc_weight': mx.nd.zeros((6, 4)), 'fc_bias': mx.nd.zeros((6))}
-        >>> texec = net.bind(ctx=mx.cpu(), args=args, args_grad=args_grad)
-        >>> out = texec.forward(is_train=True)[0].copy()
-        >>> print out.asnumpy()
-        [[ 0.00378404  0.07600445  0.07600445  0.07600445  0.20660152  0.5616011 ]]
-        >>> texec.backward()
-        >>> print(texec.grad_arrays[1].asnumpy())
-        [[ 0.00378404  0.00378404  0.00378404  0.00378404]
-         [-0.92399555 -0.92399555 -0.92399555 -0.92399555]
-         [ 0.07600445  0.07600445  0.07600445  0.07600445]
-         [ 0.07600445  0.07600445  0.07600445  0.07600445]
-         [ 0.20660152  0.20660152  0.20660152  0.20660152]
-         [ 0.5616011   0.5616011   0.5616011   0.5616011 ]]
-        >>>
-        >>> # Example for binding on non-loss function symbol.
-        >>> # Here the binding symbol is neither built-in loss function
-        >>> # nor customized loss created by MakeLoss.
-        >>> # As a result the head gradient is not automatically provided.
-        >>> a = mx.sym.Variable('a')
-        >>> b = mx.sym.Variable('b')
-        >>> # c is not a loss function symbol
-        >>> c = 2 * a + b
-        >>> args = {'a': mx.nd.array([1,2]), 'b':mx.nd.array([2,3])}
-        >>> args_grad = {'a': mx.nd.zeros((2)), 'b': mx.nd.zeros((2))}
-        >>> texec = c.bind(ctx=mx.cpu(), args=args, args_grad=args_grad)
-        >>> out = texec.forward(is_train=True)[0].copy()
-        >>> print(out.asnumpy())
-        [ 4.  7.]
-        >>> # out_grads is the head gradient in backward pass.
-        >>> # Here we define 'c' as loss function.
-        >>> # Then 'out' is passed as head gradient of backward pass.
-        >>> texec.backward(out)
-        >>> print(texec.grad_arrays[0].asnumpy())
-        [ 8.  14.]
-        >>> print(texec.grad_arrays[1].asnumpy())
-        [ 4.  7.]
         """
         if out_grads is None:
             out_grads = []
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py
index d672bef4ff6b..86f36877dbf8 100644
--- a/python/mxnet/test_utils.py
+++ b/python/mxnet/test_utils.py
@@ -1674,71 +1674,6 @@ def download(url, fname=None, dirname=None, overwrite=False, retries=5):
     logging.info("downloaded %s into %s successfully", url, fname)
     return fname
 
-def download_model(model_name, dst_dir='./', meta_info=None):
-    """Download a model from data.mxnet.io
-
-    Parameters
-    ----------
-    model_name : str
-        Model name to download
-    dst_dir : str
-        Destination Directory to download the model
-    meta_info : dict of dict
-        Mapping from model_name to dict of the following structure:
-        {'symbol': url, 'params': url}
-
-    Returns
-    -------
-    Two element tuple containing model_name and epoch for the params saved
-    """
-    _base_model_url = 'http://data.mxnet.io/models/'
-    _default_model_info = {
-        'imagenet1k-inception-bn': {'symbol':_base_model_url+'imagenet/inception-bn/Inception-BN-symbol.json',
-                                    'params':_base_model_url+'imagenet/inception-bn/Inception-BN-0126.params'},
-        'imagenet1k-resnet-18': {'symbol':_base_model_url+'imagenet/resnet/18-layers/resnet-18-symbol.json',
-                                 'params':_base_model_url+'imagenet/resnet/18-layers/resnet-18-0000.params'},
-        'imagenet1k-resnet-34': {'symbol':_base_model_url+'imagenet/resnet/34-layers/resnet-34-symbol.json',
-                                 'params':_base_model_url+'imagenet/resnet/34-layers/resnet-34-0000.params'},
-        'imagenet1k-resnet-50': {'symbol':_base_model_url+'imagenet/resnet/50-layers/resnet-50-symbol.json',
-                                 'params':_base_model_url+'imagenet/resnet/50-layers/resnet-50-0000.params'},
-        'imagenet1k-resnet-101': {'symbol':_base_model_url+'imagenet/resnet/101-layers/resnet-101-symbol.json',
-                                  'params':_base_model_url+'imagenet/resnet/101-layers/resnet-101-0000.params'},
-        'imagenet1k-resnet-152': {'symbol':_base_model_url+'imagenet/resnet/152-layers/resnet-152-symbol.json',
-                                  'params':_base_model_url+'imagenet/resnet/152-layers/resnet-152-0000.params'},
-        'imagenet1k-resnext-50': {'symbol':_base_model_url+'imagenet/resnext/50-layers/resnext-50-symbol.json',
-                                  'params':_base_model_url+'imagenet/resnext/50-layers/resnext-50-0000.params'},
-        'imagenet1k-resnext-101': {'symbol':_base_model_url+'imagenet/resnext/101-layers/resnext-101-symbol.json',
-                                   'params':_base_model_url+'imagenet/resnext/101-layers/resnext-101-0000.params'},
-        'imagenet1k-resnext-101-64x4d':
-            {'symbol':_base_model_url+'imagenet/resnext/101-layers/resnext-101-64x4d-symbol.json',
-             'params':_base_model_url+'imagenet/resnext/101-layers/resnext-101-64x4d-0000.params'},
-        'imagenet11k-resnet-152':
-            {'symbol':_base_model_url+'imagenet-11k/resnet-152/resnet-152-symbol.json',
-             'params':_base_model_url+'imagenet-11k/resnet-152/resnet-152-0000.params'},
-        'imagenet11k-place365ch-resnet-152':
-            {'symbol':_base_model_url+'imagenet-11k-place365-ch/resnet-152-symbol.json',
-             'params':_base_model_url+'imagenet-11k-place365-ch/resnet-152-0000.params'},
-        'imagenet11k-place365ch-resnet-50':
-            {'symbol':_base_model_url+'imagenet-11k-place365-ch/resnet-50-symbol.json',
-             'params':_base_model_url+'imagenet-11k-place365-ch/resnet-50-0000.params'},
-    }
-
-
-    if meta_info is None:
-        meta_info = _default_model_info
-    meta_info = dict(meta_info)
-    if model_name not in meta_info:
-        return (None, 0)
-    if not os.path.isdir(dst_dir):
-        os.mkdir(dst_dir)
-    meta = dict(meta_info[model_name])
-    assert 'symbol' in meta, "missing symbol url"
-    model_name = os.path.join(dst_dir, model_name)
-    mx.test_utils.download(meta['symbol'], model_name+'-symbol.json')
-    assert 'params' in meta, "mssing parameter file url"
-    mx.test_utils.download(meta['params'], model_name+'-0000.params')
-    return (model_name, 0)
-
 
 def get_mnist(path='data'):
     """Download and load the MNIST dataset
diff --git a/python/mxnet/visualization.py b/python/mxnet/visualization.py
index 346140da8e31..5b5d3275d887 100644
--- a/python/mxnet/visualization.py
+++ b/python/mxnet/visualization.py
@@ -250,7 +250,6 @@ def plot_network(symbol, title="plot", save_format='pdf', shape=None, dtype=None
     >>> net = mx.sym.FullyConnected(data=net, name='fc1', num_hidden=128)
     >>> net = mx.sym.Activation(data=net, name='relu1', act_type="relu")
     >>> net = mx.sym.FullyConnected(data=net, name='fc2', num_hidden=10)
-    >>> net = mx.sym.SoftmaxOutput(data=net, name='out')
     >>> digraph = mx.viz.plot_network(net, shape={'data':(100,200)},
     ... node_attrs={"fixedsize":"false"})
     >>> digraph.view()
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala
index 4d95f43751ba..402509e8ea9a 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/ModuleSuite.scala
@@ -209,208 +209,4 @@ class ModuleSuite extends FunSuite with BeforeAndAfterAll {
     assert(mod.getOutputsMerged()(0).shape == dShape)
     assert(mod.getParams._1("fc_bias").toArray.forall(x => (x - -3f) < 1e-3))
   }
-
-  test ("module setParams") {
-    val data = NDArray.array(Array(0.05f, 0.1f), Shape(1, 1, 1, 2))
-    val label = NDArray.array(Array(0.01f, 0.99f), Shape(1, 1, 1, 2))
-    val trainData = new NDArrayIter(
-      IndexedSeq(data), IndexedSeq(label), labelName = "softmax_label")
-
-    // symbols
-    var x = Symbol.Variable("data")
-    x = Symbol.FullyConnected(name = "fc_0")()(Map("data" -> x, "num_hidden" -> 2))
-    x = Symbol.Activation(name = "act_0")()(Map("data" -> x, "act_type" -> "sigmoid"))
-    x = Symbol.FullyConnected(name = "fc_1")()(Map("data" -> x, "num_hidden" -> 2))
-    x = Symbol.Activation(name = "act_1")()(Map("data" -> x, "act_type" -> "sigmoid"))
-    x = Symbol.LinearRegressionOutput(name = "softmax")()(Map("data" -> x, "grad_scale" -> 2))
-
-    // create module
-    val mod = new Module(x, contexts = Array(Context.cpu()))
-    mod.bind(dataShapes = trainData.provideDataDesc,
-      Option(trainData.provideLabelDesc))
-    val argParamsCorrect = Map(
-      "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)),
-      "fc_0_bias" -> NDArray.array(Array(0.35f, 0.35f), Shape(2)),
-      "fc_1_weight" -> NDArray.array(Array(0.4f, 0.45f, 0.5f, 0.55f), Shape(2, 2)),
-      "fc_1_bias" -> NDArray.array(Array(0.6f, 0.6f), Shape(2))
-    )
-    val argParamsMissing = Map(
-      "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)),
-      "fc_0_bias" -> NDArray.array(Array(0.35f, 0.35f), Shape(2)),
-      "fc_1_weight" -> NDArray.array(Array(0.4f, 0.45f, 0.5f, 0.55f), Shape(2, 2))
-    )
-    val argParamsExtra = Map(
-      "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)),
-      "fc_0_bias" -> NDArray.array(Array(0.35f, 0.35f), Shape(2)),
-      "fc_1_weight" -> NDArray.array(Array(0.4f, 0.45f, 0.5f, 0.55f), Shape(2, 2)),
-      "fc_1_bias" -> NDArray.array(Array(0.6f, 0.6f), Shape(2)),
-      "fc_2_weight" -> NDArray.array(Array(0.6f, 0.6f), Shape(2))
-    )
-
-    mod.setParams(forceInit = true, argParams = argParamsCorrect,
-      auxParams = null)
-
-    // test allow missing
-    mod.setParams(forceInit = true, argParams = argParamsMissing,
-      auxParams = null, allowMissing = true)
-
-    // test allow extra
-    mod.setParams(forceInit = true, argParams = argParamsExtra, auxParams = null,
-      allowMissing = true, allowExtra = true)
-  }
-
-  test ("monitor") {
-    // data iter
-    val data = NDArray.array(Array(0.05f, 0.1f), Shape(1, 1, 1, 2))
-    val label = NDArray.array(Array(0.01f, 0.99f), Shape(1, 1, 1, 2))
-    val trainData = new NDArrayIter(
-      IndexedSeq(data), IndexedSeq(label), labelName = "softmax_label")
-
-    // symbols
-    var x = Symbol.Variable("data")
-    x = Symbol.FullyConnected(name = "fc_0")()(Map("data" -> x, "num_hidden" -> 2))
-    x = Symbol.Activation(name = "act_0")()(Map("data" -> x, "act_type" -> "sigmoid"))
-    x = Symbol.FullyConnected(name = "fc_1")()(Map("data" -> x, "num_hidden" -> 2))
-    x = Symbol.Activation(name = "act_1")()(Map("data" -> x, "act_type" -> "sigmoid"))
-    x = Symbol.LinearRegressionOutput(name = "softmax")()(Map("data" -> x, "grad_scale" -> 2))
-
-    // create monitor
-    def meanAbs(x: NDArray): NDArray = {
-      val sumAbs = NDArray.sum(NDArray.abs(x))
-      sumAbs / x.shape.product
-    }
-    val mon = new Monitor(1, statFunc = meanAbs)
-
-    // create module
-    val mod = new Module(x, contexts = Array(Context.cpu()))
-    mod.bind(dataShapes = trainData.provideDataDesc,
-      Option(trainData.provideLabelDesc))
-    mod.installMonitor(mon)
-    val argParams = Map(
-      "fc_0_weight" -> NDArray.array(Array(0.15f, 0.2f, 0.25f, 0.3f), Shape(2, 2)),
-      "fc_0_bias" -> NDArray.array(Array(0.35f, 0.35f), Shape(2)),
-      "fc_1_weight" -> NDArray.array(Array(0.4f, 0.45f, 0.5f, 0.55f), Shape(2, 2)),
-      "fc_1_bias" -> NDArray.array(Array(0.6f, 0.6f), Shape(2))
-    )
-    mod.initParams(argParams = argParams)
-
-    val dataBatch = trainData.next()
-    mon.tic()
-    mod.forwardBackward(dataBatch)
-    val res = mon.toc()
-    val keys = Array("act_0", "act_1", "data", "fc_0", "fc_1", "softmax")
-    val monResultCounts = Array(0, 0, 0, 0, 0, 0)
-    assert(res.length == 21)
-    for ((n, k, v) <- res) {
-      var break = false
-      for ((key, idx) <- keys.zipWithIndex) {
-        if (!break && k.startsWith(key)) {
-          monResultCounts(idx) += 1
-          break = true
-        }
-      }
-    }
-    assert(monResultCounts.zip(Array(2, 2, 1, 6, 6, 4)).forall(x => x._1 == x._2))
-  }
-
-  test ("forward reshape") {
-    val numClass = 10
-    val data1 = Symbol.Variable("data1")
-    val data2 = Symbol.Variable("data2")
-    val conv1 = Symbol.Convolution()()(Map("data" -> data1,
-        "kernel" -> "(2, 2)", "num_filter" -> 2, "stride" -> "(2, 2)"))
-    val conv2 = Symbol.Convolution()()(Map("data" -> data2,
-        "kernel" -> "(3, 3)", "num_filter" -> 3, "stride" -> "(1, 1)"))
-    val pooling1 = Symbol.Pooling()()(Map("data" -> conv1,
-        "kernel" -> "(2, 2)", "pool_type" -> "avg", "stride" -> "(1, 1)"))
-    val pooling2 = Symbol.Pooling()()(Map("data" -> conv2,
-        "kernel" -> "(2, 2)", "pool_type" -> "max", "stride" -> "(1, 1)"))
-    val flatten1 = Symbol.flatten()()(Map("data" -> pooling1))
-    val flatten2 = Symbol.flatten()()(Map("data" -> pooling2))
-    val sum = Symbol.sum()()(Map("data" -> flatten1, "axis" -> 1)) +
-      Symbol.sum()()(Map("data" -> flatten2, "axis" -> 1))
-    val fc = Symbol.FullyConnected()()(
-      Map("data" -> sum, "num_hidden" -> numClass))
-    val sym = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc))
-
-    var dShape1 = Shape(10, 3, 64, 64)
-    var dShape2 = Shape(10, 3, 32, 32)
-    var lShape = Shape(10)
-
-    val mod = new Module(sym, IndexedSeq("data1", "data2"))
-    mod.bind(dataShapes = IndexedSeq(
-      DataDesc("data1", dShape1), DataDesc("data2", dShape2, layout = "NCHW")),
-      labelShapes = Option(IndexedSeq(DataDesc("softmax_label", lShape, layout = "N")))
-    )
-    mod.initParams()
-    mod.initOptimizer(optimizer = new SGD(learningRate = 0.01f))
-
-    // Train with original data shapes
-    var dataBatch = new DataBatch(
-      data = IndexedSeq(
-        NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(),
-        NDArray.random_uniform(Map("low" -> 5, "high" -> 15, "shape" -> dShape2.toString()))()),
-      label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0)
-    mod.forward(dataBatch)
-    assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass))
-    mod.backward()
-    mod.update()
-
-    dShape1 = Shape(3, 3, 64, 64)
-    dShape2 = Shape(3, 3, 32, 32)
-    lShape = Shape(3)
-    dataBatch = new DataBatch(
-      data = IndexedSeq(
-        NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(),
-        NDArray.random_uniform(Map("low" -> 5, "high" -> 15, "shape" -> dShape2.toString()))()),
-      label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0)
-    mod.forward(dataBatch)
-    assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass))
-    mod.backward()
-    mod.update()
-
-    dShape1 = Shape(20, 3, 64, 64)
-    dShape2 = Shape(20, 3, 32, 32)
-    lShape = Shape(20)
-    dataBatch = new DataBatch(
-      data = IndexedSeq(
-        NDArray.random_uniform(Map("low" -> 3, "high" -> 5, "shape" -> dShape1.toString()))(),
-        NDArray.random_uniform(Map("low" -> 10, "high" -> 25, "shape" -> dShape2.toString()))()),
-      label = IndexedSeq(NDArray.ones(lShape)), index = null, pad = 0)
-    mod.forward(dataBatch)
-    assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass))
-    mod.backward()
-    mod.update()
-
-    // Train with both different batch size and data shapes
-    dShape1 = Shape(20, 3, 120, 120)
-    dShape2 = Shape(20, 3, 32, 64)
-    lShape = Shape(20)
-    dataBatch = new DataBatch.Builder()
-      .setData(
-        NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(),
-        NDArray.random_uniform(Map("low" -> 5, "high" -> 15, "shape" -> dShape2.toString()))())
-      .setLabel(NDArray.ones(lShape))
-      .setPad(0)
-      .build()
-    mod.forward(dataBatch)
-    assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass))
-    mod.backward()
-    mod.update()
-
-    dShape1 = Shape(5, 3, 28, 40)
-    dShape2 = Shape(5, 3, 24, 16)
-    lShape = Shape(5)
-    dataBatch = new DataBatch.Builder()
-      .setData(
-        NDArray.random_uniform(Map("low" -> 0, "high" -> 9, "shape" -> dShape1.toString()))(),
-        NDArray.random_uniform(Map("low" -> 15, "high" -> 25, "shape" -> dShape2.toString()))())
-      .setLabel(NDArray.ones(lShape))
-      .setPad(0)
-      .build()
-    mod.forward(dataBatch)
-    assert(mod.getOutputsMerged()(0).shape == Shape(lShape(0), numClass))
-    mod.backward()
-    mod.update()
-  }
 }
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/OperatorSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/OperatorSuite.scala
index 7c0b009ac8e6..dc11b7bfb9b7 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/OperatorSuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/OperatorSuite.scala
@@ -112,41 +112,6 @@ class OperatorSuite extends FunSuite with BeforeAndAfterAll
     }
   }
 
-  private def checkRegression(model: Symbol,
-                              forward: Float => Float,
-                              backward: (Float, Float) => Float) = {
-    val shape = Shape(3, 1)
-    val arrData = Random.uniform(-1, 1, shape)
-    val arrLabel = Random.uniform(0, 1, Shape(shape.head))
-    val arrGrad = NDArray.empty(shape)
-    val exec1 = model.bind(Context.cpu(),
-      args = Array(arrData, arrLabel), argsGrad = Map("data" -> arrGrad))
-    exec1.forward()
-    assert(exec1.outputs(0).shape === shape)
-    val out1 = exec1.outputs(0).toArray
-    val npout = arrData.toArray.map(forward(_))
-    assert(CheckUtils.reldiff(npout, out1) < 1e-6f)
-
-    exec1.backward()
-    // arrData shape: Vector(3, 1)
-    // arrLabel shape: Vector(3)
-    val npoutBack = (npout zip arrLabel.toArray).map { case (data, label) =>
-      backward(data, label)
-    }
-    assert(CheckUtils.reldiff(npoutBack, arrGrad.toArray) < 1e-6f)
-  }
-
-  test("regression") {
-    checkRegression(Symbol.LogisticRegressionOutput()()(
-      Map("data" -> Symbol.Variable("data"), "label" -> Symbol.Variable("label"))),
-      (x: Float) => 1.0f / (1.0f + Math.exp(-x).toFloat),
-      (x: Float, y: Float) => x - y)
-    checkRegression(Symbol.LinearRegressionOutput()()(
-      Map("data" -> Symbol.Variable("data"), "label" -> Symbol.Variable("label"))),
-      (x: Float) => x,
-      (x: Float, y: Float) => x - y)
-  }
-
   // TODO: test softmax
 
   test("swap axes") {
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala
index 415be5122c95..6a0f0ecba089 100644
--- a/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala
+++ b/scala-package/core/src/test/scala/org/apache/mxnet/SymbolSuite.scala
@@ -49,18 +49,6 @@ class SymbolSuite extends FunSuite with BeforeAndAfterAll {
     assert(fc1.listArguments() === oldfc.listArguments())
   }
 
-  test("symbol infer type") {
-    val data = Symbol.Variable("data")
-    val f32data = Symbol.Cast()()(Map("data" -> data, "dtype" -> "float32"))
-    val fc1 = Symbol.FullyConnected(name = "fc1")()(Map("data" -> f32data, "num_hidden" -> 128))
-    val mlp = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc1))
-
-    val (arg, out, aux) = mlp.inferType(Map("data" -> DType.Float64))
-    assert(arg.toArray === Array(DType.Float64, DType.Float32, DType.Float32, DType.Float32))
-    assert(out.toArray === Array(DType.Float32))
-    assert(aux.isEmpty)
-  }
-
   test("symbol copy") {
     val data = Symbol.Variable("data")
     val data2 = data.clone()
diff --git a/scala-package/core/src/test/scala/org/apache/mxnet/train/ConvSuite.scala b/scala-package/core/src/test/scala/org/apache/mxnet/train/ConvSuite.scala
deleted file mode 100644
index be6b6b983fe2..000000000000
--- a/scala-package/core/src/test/scala/org/apache/mxnet/train/ConvSuite.scala
+++ /dev/null
@@ -1,117 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnet.train
-
-import org.apache.mxnet.optimizer.SGD
-import org.apache.mxnet._
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
-import org.slf4j.LoggerFactory
-
-import scala.collection.mutable.ListBuffer
-import scala.language.postfixOps
-import scala.sys.process._
-
-class ConvSuite extends FunSuite with BeforeAndAfterAll {
-  private val logger = LoggerFactory.getLogger(classOf[ConvSuite])
-
-  private var tu = new TestUtil
-
-  test("train mnist") {
-    CancelTestUtil.assumeStandardDecimalSeparator()
-    // symbol net
-    val batchSize = 100
-
-    val data = Symbol.Variable("data")
-    val conv1 = Symbol.Convolution(name = "conv1")()(Map("data" -> data, "num_filter" -> 32,
-                                                         "kernel" -> (3, 3), "stride" -> (2, 2)))
-    val bn1 = Symbol.BatchNorm(name = "bn1")()(Map("data" -> conv1))
-    val act1 = Symbol.Activation(name = "relu1")()(Map("data" -> bn1, "act_type" -> "relu"))
-    val mp1 = Symbol.Pooling(name = "mp1")()(Map("data" -> act1, "kernel" -> (2, 2),
-                                                 "stride" -> (2, 2), "pool_type" -> "max"))
-
-    val conv2 = Symbol.Convolution(name = "conv2")()(Map("data" -> mp1, "num_filter" -> 32,
-                                                         "kernel" -> (3, 3), "stride" -> (2, 2)))
-    val bn2 = Symbol.BatchNorm(name = "bn2")()(Map("data" -> conv2))
-    val act2 = Symbol.Activation(name = "relu2")()(Map("data" -> bn2, "act_type" -> "relu"))
-    val mp2 = Symbol.Pooling(name = "mp2")()(Map("data" -> act2, "kernel" -> (2, 2),
-                                                 "stride" -> (2, 2), "pool_type" -> "max"))
-
-    val fl = Symbol.Flatten(name = "flatten")()(Map("data" -> mp2))
-    val fc2 = Symbol.FullyConnected(name = "fc2")()(Map("data" -> fl, "num_hidden" -> 10))
-    val softmax = Symbol.SoftmaxOutput(name = "sm")()(Map("data" -> fc2))
-
-    // get data
-    "./scripts/get_mnist_data.sh" !
-    val trainDataIter = IO.MNISTIter(Map(
-      "image" -> tu.dataFile("train-images-idx3-ubyte"),
-      "label" -> tu.dataFile("train-labels-idx1-ubyte"),
-      "data_shape" -> "(1, 28, 28)",
-      "label_name" -> "sm_label",
-      "batch_size" -> batchSize.toString,
-      "shuffle" -> "1",
-      "flat" -> "0",
-      "silent" -> "0",
-      "seed" -> "10"))
-
-    val valDataIter = IO.MNISTIter(Map(
-      "image" -> tu.dataFile("t10k-images-idx3-ubyte"),
-      "label" -> tu.dataFile("t10k-labels-idx1-ubyte"),
-      "data_shape" -> "(1, 28, 28)",
-      "label_name" -> "sm_label",
-      "batch_size" -> batchSize.toString,
-      "shuffle" -> "1",
-      "flat" -> "0", "silent" -> "0"))
-
-    val model = FeedForward.newBuilder(softmax)
-          .setContext(Context.cpu())
-          .setNumEpoch(1)
-          .setOptimizer(new SGD(learningRate = 0.1f, momentum = 0.9f, wd = 0.0001f))
-          .setTrainData(trainDataIter)
-          .setEvalData(valDataIter)
-          .build()
-    logger.info("Finish fit ...")
-
-    val probArrays = model.predict(valDataIter)
-    assert(probArrays.length === 1)
-    val prob = probArrays(0)
-    logger.info("Finish predict ...")
-
-    valDataIter.reset()
-    val labels = ListBuffer.empty[NDArray]
-    while (valDataIter.hasNext) {
-      val evalData = valDataIter.next()
-      labels += evalData.label(0).copy()
-    }
-    val y = NDArray.concatenate(labels)
-
-    val py = NDArray.argmax_channel(prob)
-    assert(y.shape === py.shape)
-
-    var numCorrect = 0
-    var numInst = 0
-    for ((labelElem, predElem) <- y.toArray zip py.toArray) {
-      if (labelElem == predElem) {
-        numCorrect += 1
-      }
-      numInst += 1
-    }
-    val acc = numCorrect.toFloat / numInst
-    logger.info(s"Final accuracy = $acc")
-    assert(acc > 0.92)
-  }
-}
diff --git a/scala-package/examples/scripts/rnn/run_test_charrnn.sh b/scala-package/examples/scripts/rnn/run_test_charrnn.sh
deleted file mode 100644
index df1fac3667db..000000000000
--- a/scala-package/examples/scripts/rnn/run_test_charrnn.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-MXNET_ROOT=$(cd "$(dirname $0)/../../../.."; pwd)
-CLASS_PATH=$MXNET_ROOT/scala-package/assembly/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*
-
-# you can get the training data file using the following command
-# curl -O http://data.mxnet.io/data/char_lstm.zip
-# unzip -o char_lstm.zip
-# for example ./datas/obama.txt
-DATA_PATH=$1
-# for example ./models/obama
-MODEL_PREFIX=$2
-# feel free to change the starter sentence
-STARTER_SENTENCE="The joke"
-
-java -Xmx4G -cp $CLASS_PATH \
-	org.apache.mxnetexamples.rnn.TestCharRnn \
-	--data-path $DATA_PATH \
-	--model-prefix $MODEL_PREFIX \
-	--starter-sentence "$STARTER_SENTENCE"
diff --git a/scala-package/examples/scripts/run_cnntextclassification.sh b/scala-package/examples/scripts/run_cnntextclassification.sh
deleted file mode 100644
index 1e9b46540932..000000000000
--- a/scala-package/examples/scripts/run_cnntextclassification.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-MXNET_ROOT=$(cd "$(dirname $0)/../../.."; pwd)
-CLASS_PATH=$MXNET_ROOT/scala-package/assembly/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*
-
-# which gpu card to use, -1 means cpu
-GPU=$1
-# the mr dataset path, you should put the pos and neg file in the same folder
-MR_DATASET_PATH=$2
-# the trained word2vec file path, binary or text format
-W2V_FILE_PATH=$3
-# whether the format of the word2vec file is binary,1 means binary, 0 means text
-W2V_FORMAT_BIN=$4
-BATCH_SIZE=$5
-SAVE_MODEL_PATH=$6
-
-java -Xmx8G -cp $CLASS_PATH \
-	org.apache.mxnetexamples.cnntextclassification.CNNTextClassification \
-	--gpu $GPU \
-	--mr-dataset-path $MR_DATASET_PATH \
-	--w2v-file-path $W2V_FILE_PATH \
-	--w2v-format-bin $W2V_FORMAT_BIN \
-	--batch-size $BATCH_SIZE \
-	--save-model-path $SAVE_MODEL_PATH
diff --git a/scala-package/examples/scripts/run_gan_mnist.sh b/scala-package/examples/scripts/run_gan_mnist.sh
deleted file mode 100644
index 539adfdc9ef2..000000000000
--- a/scala-package/examples/scripts/run_gan_mnist.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-MXNET_ROOT=$(cd "$(dirname $0)/../../.."; pwd)
-CLASS_PATH=$MXNET_ROOT/scala-package/assembly/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*
-
-# which gpu card to use, -1 means cpu
-GPU=$1
-
-# the mnist data path
-# you can get the mnist data using the script core/scripts/get_mnist_data.sh
-MNIST_DATA_PATH=$2
-
-# the path to save the generated results
-OUTPUT_PATH=$3
-
-java -Xmx4G -cp $CLASS_PATH \
-	org.apache.mxnetexamples.gan.GanMnist \
-	--mnist-data-path $MNIST_DATA_PATH \
-	--gpu $GPU \
-	--output-path $OUTPUT_PATH
diff --git a/scala-package/examples/scripts/run_multitask.sh b/scala-package/examples/scripts/run_multitask.sh
deleted file mode 100644
index abf272fbe4a1..000000000000
--- a/scala-package/examples/scripts/run_multitask.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-MXNET_ROOT=$(cd "$(dirname $0)/../../.."; pwd)
-CLASS_PATH=$MXNET_ROOT/scala-package/assembly/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*
-
-# which gpu card to use, -1 means cpu
-GPU=$1
-
-# the mnist data path
-# you can get the mnist data using the script core/scripts/get_mnist_data.sh
-DATA_PATH=$2
-
-java -Xmx4G -cp $CLASS_PATH \
-	org.apache.mxnetexamples.multitask.ExampleMultiTask \
-	--data-path $DATA_PATH \
-	--gpu $GPU \
diff --git a/scala-package/examples/scripts/run_train_mnist.sh b/scala-package/examples/scripts/run_train_mnist.sh
deleted file mode 100755
index 418065e77577..000000000000
--- a/scala-package/examples/scripts/run_train_mnist.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-set -e
-
-MXNET_ROOT=$(cd "$(dirname $0)/../../.."; pwd)
-echo $MXNET_ROOT
-CLASS_PATH=$MXNET_ROOT/scala-package/assembly/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*
-
-# model dir
-DATA_PATH=$2
-
-java -XX:+PrintGC -Dmxnet.traceLeakedObjects=false -cp $CLASS_PATH \
-        org.apache.mxnetexamples.imclassification.TrainModel \
-        --data-dir $MXNET_ROOT/scala-package/examples/mnist/ \
-        --network mlp \
-        --num-layers 50 \
-        --num-epochs 10000000 \
-        --batch-size 1024
\ No newline at end of file
diff --git a/scala-package/examples/scripts/run_visualization.sh b/scala-package/examples/scripts/run_visualization.sh
deleted file mode 100644
index 7f5b94fbe2b5..000000000000
--- a/scala-package/examples/scripts/run_visualization.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-
-MXNET_ROOT=$(cd "$(dirname $0)/../../.."; pwd)
-CLASS_PATH=$MXNET_ROOT/scala-package/assembly/assembly/target/*:$MXNET_ROOT/scala-package/examples/target/*:$MXNET_ROOT/scala-package/examples/target/classes/lib/*
-
-# please install the Graphviz library
-# if you are using ubuntu, use the following command:
-# sudo apt-get install graphviz
-
-# path to save the generated visualization result
-OUT_DIR=$1
-# net to visualze, e.g. "LeNet", "AlexNet", "VGG", "GoogleNet", "Inception_BN", "Inception_V3", "ResNet_Small"
-NET=$2
-
-java -Xmx1024m -cp $CLASS_PATH \
-	org.apache.mxnetexamples.visualization.ExampleVis \
-	--out-dir $OUT_DIR  \
-	--net $NET
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/benchmark/README.md b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/benchmark/README.md
index 67ee1ef65bab..2b32dd6664bc 100644
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/benchmark/README.md
+++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/benchmark/README.md
@@ -22,7 +22,6 @@ The benchmarking scripts provided runs an experiment for single inference calls
 Currently the ScalaInferenceBenchmark script supports three Scala examples : 
 1. [ImageClassification using ResNet-152](https://github.com/apache/incubator-mxnet/blob/master/scala-package/mxnet-demo/src/main/scala/sample/ImageClassificationExample.scala)
 2. [Object Detection Example](https://github.com/apache/incubator-mxnet/blob/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/infer/objectdetector/SSDClassifierExample.scala)
-3. [Text Generation through RNNs](https://github.com/apache/incubator-mxnet/blob/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/TestCharRnn.scala)
 
 This script can be easily placed in an automated environment to run benchmark regressions on the Scala APIs. The script automatically picks up whether you are running it on a CPU machine or on a GPU machine and appropriately uses that.
 
@@ -82,19 +81,3 @@ You may need to run ```chmod u+x run_image_inference_bm.sh``` before running thi
     INFO org.apache.mxnetexamples.benchmark.CLIParserBase - 
     batch_inference_latency p99 4241, batch_inference_p50 4241, batch_inference_average 4241.00
     ```
-    
-* *Text Generation through RNNs*
-<br>The following shows an example of running TestCharRnn under the benchmark script. The script takes in the number of iterations for inference calls, the model path and the input text file. 
-For more details to run TestCharRnn as a standalone file, refer to the [README](https://github.com/apache/incubator-mxnet/blob/master/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/README.md) for TextCharRnn.
-You may need to run ```chmod u+x run_text_charrnn_bm.sh``` before running this script.
-    ```bash
-    wget https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/RNN/obama.zip
-    unzip obama.zip
-    cd <Path-To-MXNET-Repo>/scala-package/examples/scripts/benchmark
-    ./run_text_charrnn_bm.sh cpu CharRnn 100 <path-to-model>/obama <path-to-model>/obama.txt 
-    ```
-    Upon running this script, you might see an output like this : 
-    ```
-    [main] INFO org.apache.mxnetexamples.benchmark.CLIParserBase - 
-    single_inference_latency p99 4097, single_inference_p50 2560, single_inference_average 2673.720000 
-    ```
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/benchmark/ScalaInferenceBenchmark.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/benchmark/ScalaInferenceBenchmark.scala
index fde9bdbc0abf..ba6c4b8ca7a9 100644
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/benchmark/ScalaInferenceBenchmark.scala
+++ b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/benchmark/ScalaInferenceBenchmark.scala
@@ -21,7 +21,6 @@ import org.apache.mxnetexamples.InferBase
 import org.apache.mxnetexamples.infer.imageclassifier.ImageClassifierExample
 import org.apache.mxnet._
 import org.apache.mxnetexamples.infer.objectdetector.SSDClassifierExample
-import org.apache.mxnetexamples.rnn.TestCharRnn
 import org.kohsuke.args4j.{CmdLineParser, Option}
 import org.slf4j.LoggerFactory
 
@@ -122,12 +121,6 @@ object ScalaInferenceBenchmark {
           val parsedVals = new CmdLineParser(imParser).parseArgument(args.toList.asJava)
           new SSDClassifierExample(imParser)
         }
-        case "CharRnn" => {
-          val imParser = new org.apache.mxnetexamples.rnn.CLIParser
-          baseCLI = imParser
-          val parsedVals = new CmdLineParser(imParser).parseArgument(args.toList.asJava)
-          new TestCharRnn(imParser)
-        }
         case _ => throw new Exception("Invalid example name to run")
       }
 
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/CNNTextClassification.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/CNNTextClassification.scala
deleted file mode 100644
index 04cc6e240bc2..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/CNNTextClassification.scala
+++ /dev/null
@@ -1,331 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.cnntextclassification
-
-import org.apache.mxnet.optimizer.RMSProp
-import org.apache.mxnet.{Context, Executor, Model, NDArray, Optimizer, ResourceScope, Shape, Symbol, Uniform}
-import org.kohsuke.args4j.{CmdLineParser, Option}
-import org.slf4j.LoggerFactory
-
-import scala.collection.JavaConverters._
-import scala.util.Random
-
-/**
-  * An Implementation of the paper
-  * Convolutional Neural Networks for Sentence Classification
-  */
-object CNNTextClassification {
-
-  private val logger = LoggerFactory.getLogger(classOf[CNNTextClassification])
-
-  case class CNNModel(cnnExec: Executor, symbol: Symbol, data: NDArray, label: NDArray,
-                      argsDict: Map[String, NDArray], gradDict: Map[String, NDArray])
-
-  def makeTextCNN(sentenceSize: Int, numEmbed: Int, batchSize: Int,
-                  numLabel: Int = 2, filterList: Array[Int] = Array(3, 4, 5), numFilter: Int = 100,
-                  dropout: Float = 0.5f): Symbol = {
-
-    val inputX = Symbol.Variable("data")
-    val inputY = Symbol.Variable("softmax_label")
-    val polledOutputs = filterList.map { filterSize =>
-      val conv = Symbol.api.Convolution(data = Some(inputX),
-        kernel = new Shape(filterSize, numEmbed), num_filter = numFilter)
-      val relu = Symbol.api.Activation(data = Some(conv), act_type = "relu")
-      val pool = Symbol.api.Pooling(data = Some(relu), pool_type = Some("max"),
-        kernel = Some(new Shape(sentenceSize - filterSize + 1, 1)), stride = Some(new Shape(1, 1)))
-      relu.dispose()
-      conv.dispose()
-      pool
-    }
-
-    val totalFilters = numFilter * filterList.length
-    // val concat = Symbol.Concat()(polledOutputs: _*)(Map("dim" -> 1))
-    val concat = Symbol.api.concat(data = polledOutputs,
-      num_args = polledOutputs.length, dim = Some(1))
-    val hPool = Symbol.api.reshape(data = Some(concat),
-      target_shape = Some(new Shape(batchSize, totalFilters)))
-
-    val hDrop = {
-      if (dropout > 0f) Symbol.api.Dropout(data = Some(hPool), p = Some(dropout))
-      else hPool
-    }
-
-    val fc = Symbol.api.FullyConnected(data = Some(hDrop), num_hidden = numLabel)
-    val sm = Symbol.api.SoftmaxOutput(data = Some(fc), label = Some(inputY))
-    fc.dispose()
-    hDrop.dispose()
-    hPool.dispose()
-    concat.dispose()
-    polledOutputs.foreach(_.dispose())
-    inputX.dispose()
-    inputY.dispose()
-    sm
-  }
-
-  def setupCnnModel(ctx: Context, batchSize: Int, sentenceSize: Int, numEmbed: Int,
-      numLabel: Int = 2, numFilter: Int = 100, filterList: Array[Int ] = Array(3, 4, 5),
-      dropout: Float = 0.0f): CNNModel = {
-
-    val cnn = makeTextCNN(sentenceSize, numEmbed, batchSize,
-      numLabel, filterList, numFilter, dropout)
-    val argNames = cnn.listArguments()
-    val auxNames = cnn.listAuxiliaryStates()
-
-    val (argShapes, outShapes, auxShapes) = cnn.inferShape(
-      Map("data" -> Shape(batchSize, 1, sentenceSize, numEmbed)))
-    val argsDict = argNames.zip(argShapes.map(NDArray.zeros(_, ctx))).toMap
-    val argsGradDict = argNames.zip(argShapes)
-      .filter(x => x._1 != "softmax_label" && x._1 != "data")
-      .map(x => x._1 -> NDArray.zeros(x._2, ctx)).toMap
-    val auxDict = auxNames.zip(auxShapes.map(NDArray.zeros(_, ctx))).toMap
-    val cnnExec = cnn.bind(ctx, argsDict, argsGradDict, "add", auxDict, null, null)
-
-    val data = argsDict("data")
-    val label = argsDict("softmax_label")
-    CNNModel(cnnExec, cnn, data, label, argsDict, argsGradDict)
-  }
-
-  def trainCNN(model: CNNModel, trainBatches: Array[Array[Array[Float]]],
-               trainLabels: Array[Float], devBatches: Array[Array[Array[Float]]],
-               devLabels: Array[Float], batchSize: Int, saveModelPath: String,
-               learningRate: Float = 0.001f): Float = {
-    val maxGradNorm = 0.5f
-    val epoch = 10
-    val initializer = new Uniform(0.1f)
-    val opt = new RMSProp(learningRate)
-    val updater = Optimizer.getUpdater(opt)
-    var start = 0L
-    var end = 0L
-    var numCorrect = 0f
-    var numTotal = 0f
-    var factor = 0.5f
-    var maxAccuracy = -1f
-    var updateRate = 0
-
-    val paramBlocks = model.symbol.listArguments()
-      .filter(x => x != "data" && x != "softmax_label")
-      .zipWithIndex.map { x =>
-      initializer(x._1, model.gradDict(x._1))
-      val state = opt.createState(x._2, model.argsDict(x._1))
-      (x._2, model.argsDict(x._1), model.gradDict(x._1), state, x._1)
-    }.toArray
-    var devAcc = 0.0f
-    for (iter <- 0 until epoch) {
-      start = System.currentTimeMillis()
-      numCorrect = 0f
-      numTotal = 0f
-      updateRate = 0
-
-      ResourceScope.using() {
-        for (begin <- 0 until trainBatches.length by batchSize) {
-          val (batchD, batchL) = {
-            if (begin + batchSize <= trainBatches.length) {
-              val datas = trainBatches.drop(begin).take(batchSize)
-              val labels = trainLabels.drop(begin).take(batchSize)
-              (datas, labels)
-            } else {
-              val right = (begin + batchSize) - trainBatches.length
-              val left = trainBatches.length - begin
-              val datas = trainBatches.drop(begin).take(left) ++ trainBatches.take(right)
-              val labels = trainLabels.drop(begin).take(left) ++ trainLabels.take(right)
-              (datas, labels)
-            }
-          }
-          numTotal += batchSize
-          model.data.set(batchD.flatten.flatten)
-          model.label.set(batchL)
-
-          model.cnnExec.forward(isTrain = true)
-          model.cnnExec.backward()
-
-          val tmpCorrect = {
-            val predLabel = NDArray.api.argmax_channel(model.cnnExec.outputs(0))
-            val result = predLabel.toArray.zip(batchL).map { predLabel =>
-              if (predLabel._1 == predLabel._2) 1
-              else 0
-            }.sum.toFloat
-            predLabel.dispose()
-            result
-          }
-
-          numCorrect = numCorrect + tmpCorrect
-          val norm = Math.sqrt(paramBlocks.map { case (idx, weight, grad, state, name) =>
-            val temp = NDArray.api.norm(grad / batchSize).disposeDepsExcept(grad)
-            val l2Norm = temp.toScalar
-            temp.dispose()
-            l2Norm * l2Norm
-          }.sum).toFloat
-
-          if (updateRate % 2 == 0) {
-            paramBlocks.foreach { case (idx, weight, grad, state, name) =>
-              if (norm > maxGradNorm) {
-                grad.set(grad.toArray.map(_ * (maxGradNorm / norm)))
-                opt.update(idx, weight, grad, state)
-              }
-              else opt.update(idx, weight, grad, state)
-              grad.set(0f)
-            }
-          }
-          updateRate = updateRate + 1
-        }
-      }
-
-      // decay learning rate
-      if (iter % 50 == 0 && iter > 0) {
-        factor *= 0.5f
-        opt.setLrMult(paramBlocks.map(paramBlock => (Left(paramBlock._1), factor)).toMap)
-        logger.info(s"reset learning to ${opt.learningRate * factor}")
-      }
-      // end of training loop
-      end = System.currentTimeMillis()
-      logger.info(s"Iter $iter Train: Time: ${(end - start) / 1000}," +
-        s"Training Accuracy: ${numCorrect / numTotal * 100}%")
-
-      // eval on dev set
-      numCorrect = 0f
-      numTotal = 0f
-      for (begin <- 0 until devBatches.length by batchSize) {
-        if (begin + batchSize <= devBatches.length) {
-          numTotal += batchSize
-          val (batchD, batchL) = {
-            val datas = devBatches.drop(begin).take(batchSize)
-            val labels = devLabels.drop(begin).take(batchSize)
-            (datas, labels)
-          }
-
-          model.data.set(batchD.flatten.flatten)
-          model.label.set(batchL)
-
-          model.cnnExec.forward(isTrain = false)
-
-          val tmpCorrect = {
-            val predLabel = NDArray.api.argmax_channel(model.cnnExec.outputs(0))
-            val result = predLabel.toArray.zip(batchL).map { predLabel =>
-              if (predLabel._1 == predLabel._2) 1
-              else 0
-            }.sum.toFloat
-            predLabel.dispose()
-            result
-          }
-          numCorrect = numCorrect + tmpCorrect
-        }
-      }
-      devAcc = numCorrect / numTotal
-      logger.info(s"Dev Accuracy so far: ${devAcc * 100}%")
-      if (devAcc > maxAccuracy) {
-        maxAccuracy = devAcc
-        Model.saveCheckpoint(s"$saveModelPath/cnn-text-dev-acc-$maxAccuracy",
-          iter, model.symbol, model.cnnExec.argDict, model.cnnExec.auxDict)
-        logger.info(s"max accuracy on dev so far: ${maxAccuracy  * 100}%")
-      }
-    }
-    devAcc
-  }
-
-  def test(w2vFilePath : String, mrDatasetPath: String,
-           ctx : Context, saveModelPath: String) : Float = {
-    val output = ResourceScope.using() {
-      val (numEmbed, word2vec) = DataHelper.loadGoogleModel(w2vFilePath)
-      val (datas, labels) = DataHelper.loadMSDataWithWord2vec(
-        mrDatasetPath, numEmbed, word2vec)
-      // randomly shuffle data
-      val randIdx = Random.shuffle((0 until datas.length).toList)
-      // split train/dev set
-      val (trainDats, devDatas) = {
-        val train = randIdx.dropRight(1000).map(datas(_)).toArray
-        val dev = randIdx.takeRight(1000).map(datas(_)).toArray
-        (train, dev)
-      }
-      val (trainLabels, devLabels) = {
-        val train = randIdx.dropRight(1000).map(labels(_)).toArray
-        val dev = randIdx.takeRight(1000).map(labels(_)).toArray
-        (train, dev)
-      }
-      // reshpae for convolution input
-      val sentenceSize = datas(0).length
-      val batchSize = 100
-      val lr = 0.001f
-      val cnnModel = setupCnnModel(ctx, batchSize, sentenceSize, numEmbed)
-      val result = trainCNN(cnnModel, trainDats, trainLabels, devDatas, devLabels, batchSize,
-        saveModelPath, learningRate = lr)
-      result
-    }
-    output
-  }
-
-  def main(args: Array[String]): Unit = {
-    val exon = new CNNTextClassification
-    val parser: CmdLineParser = new CmdLineParser(exon)
-    try {
-      parser.parseArgument(args.toList.asJava)
-
-      logger.info("Loading data...")
-      val (numEmbed, word2vec) =
-        if (exon.w2vFormatBin == 1) DataHelper.loadGoogleModel(exon.w2vFilePath)
-        else DataHelper.loadPretrainedWord2vec(exon.w2vFilePath)
-      val (datas, labels) = DataHelper.loadMSDataWithWord2vec(
-        exon.mrDatasetPath, numEmbed, word2vec)
-
-      // randomly shuffle data
-      val randIdx = Random.shuffle((0 until datas.length).toList)
-      // split train/dev set
-      val (trainDats, devDatas) = {
-        val train = randIdx.dropRight(1000).map(datas(_)).toArray
-        val dev = randIdx.takeRight(1000).map(datas(_)).toArray
-        (train, dev)
-      }
-      val (trainLabels, devLabels) = {
-        val train = randIdx.dropRight(1000).map(labels(_)).toArray
-        val dev = randIdx.takeRight(1000).map(labels(_)).toArray
-        (train, dev)
-      }
-
-      // reshpae for convolution input
-      val sentenceSize = datas(0).length
-      val ctx = if (exon.gpu == -1) Context.cpu() else Context.gpu(exon.gpu)
-
-      val cnnModel = setupCnnModel(ctx, exon.batchSize, sentenceSize, numEmbed)
-      trainCNN(cnnModel, trainDats, trainLabels, devDatas, devLabels, exon.batchSize,
-        exon.saveModelPath, learningRate = exon.lr)
-
-    } catch {
-      case ex: Exception => {
-        logger.error(ex.getMessage, ex)
-        parser.printUsage(System.err)
-        sys.exit(1)
-      }
-    }
-  }
-}
-
-class CNNTextClassification {
-  @Option(name = "--lr", usage = "the initial learning rate")
-  private val lr: Float = 0.001f
-  @Option(name = "--batch-size", usage = "the batch size")
-  private val batchSize: Int = 100
-  @Option(name = "--gpu", usage = "which gpu card to use, default is -1, means using cpu")
-  private val gpu: Int = -1
-  @Option(name = "--w2v-format-bin", usage = "does the word2vec file format is binary")
-  private val w2vFormatBin: Int = 0
-  @Option(name = "--mr-dataset-path", usage = "the MR polarity dataset path")
-  private val mrDatasetPath: String = ""
-  @Option(name = "--w2v-file-path", usage = "the word2vec file path")
-  private val w2vFilePath: String = ""
-  @Option(name = "--save-model-path", usage = "the model saving path")
-  private val saveModelPath: String = ""
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/DataHelper.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/DataHelper.scala
deleted file mode 100644
index 6a128b0b5eb5..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/DataHelper.scala
+++ /dev/null
@@ -1,176 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.cnntextclassification
-
-import java.io.{BufferedInputStream, DataInputStream, FileInputStream, InputStream}
-import java.nio.charset.CodingErrorAction
-
-import org.apache.mxnet.{Context, Random, Shape}
-
-import scala.io.{Codec, Source}
-
-object DataHelper {
-
-  def cleanStr(str: String): String = {
-    str.replaceAll("[^A-Za-z0-9(),!?'`]", " ")
-        .replaceAll("'s", " 's")
-        .replaceAll("'ve", " 've")
-        .replaceAll("n't", " n't")
-        .replaceAll("'re", " 're")
-        .replaceAll("'d", " 'd")
-        .replaceAll("'ll", " 'll")
-        .replaceAll(",", " , ")
-        .replaceAll("!", " ! ")
-        .replaceAll("\\(", " \\( ")
-        .replaceAll("\\)", " \\) ")
-        .replaceAll("\\?", " \\? ")
-        .replaceAll(" {2,}", " ")
-        .trim()
-  }
-
-  // Loads MR polarity data from files, splits the data into words and generates labels.
-  // Returns split sentences and labels.
-  def loadMRDataAndLabels(dataPath: String): (Array[Array[String]], Array[Float]) = {
-    // load data from file
-    implicit val codec = Codec("UTF-8")
-    codec.onMalformedInput(CodingErrorAction.REPLACE)
-    codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
-    val positiveExamples = {
-      val lines = Source.fromFile(s"$dataPath/rt-polarity.pos").mkString.split("\n")
-      lines.map(_.trim())
-    }
-    val negativeExamples = {
-      val lines = Source.fromFile(s"$dataPath/rt-polarity.neg").mkString.split("\n")
-      lines.map(_.trim())
-    }
-    // split by words
-    val xText = {
-      val tmp = positiveExamples ++ negativeExamples
-      tmp.map(cleanStr(_)).map(_.split(" "))
-    }
-    // generate labels
-    val positiveLabels = (1 to positiveExamples.length).map(x => 1).toArray
-    val negativeLabels = (1 to negativeExamples.length).map(x => 0).toArray
-    val y = positiveLabels ++ negativeLabels
-    (xText, y.map(_.toFloat))
-  }
-
-  // Pads all sentences to the same length. The length is defined by the longest sentence.
-  // Returns padded sentences.
-  def padSentences(sentences: Array[Array[String]],
-    paddingWord: String = "</s>"): Array[Array[String]] = {
-    val sequenceLength = (-1 /: sentences.map(_.length)){ (max, len) =>
-      if (max < len) len else max
-    }
-    val paddedSetences = sentences.map { sentence =>
-      val numPadding = sequenceLength - sentence.length
-      sentence ++ (1 to numPadding).map(x => paddingWord)
-    }
-    paddedSetences
-  }
-
-  def loadPretrainedWord2vec(inFile: String): (Int, Map[String, Array[Float]]) = {
-    val lines = Source.fromFile(inFile).mkString.mkString.split("\n")
-    val (vocabSize, dim) = {
-      val head = lines(0).split(" ").map(_.toInt)
-      (head(0), head(1))
-    }
-    val word2vec = lines.drop(1).map { line =>
-      val tks = line.trim().split(" ")
-      tks(0) -> tks.drop(1).map(_.toFloat)
-    }.toMap
-    (dim, word2vec)
-  }
-
-  def readString(dis: DataInputStream): String = {
-    val MAX_SIZE = 50
-    var bytes = new Array[Byte](MAX_SIZE)
-    var b = dis.readByte()
-    var i = -1
-    val sb = new StringBuilder()
-    while (b != 32 && b != 10) {
-      i = i + 1
-      bytes(i) = b
-      b = dis.readByte()
-      if (i == 49) {
-        sb.append(new String(bytes))
-        i = -1
-        bytes = new Array[Byte](MAX_SIZE)
-      }
-    }
-    sb.append(new String(bytes, 0, i + 1))
-    sb.toString()
-  }
-
-  def getFloat(b: Array[Byte]): Float = {
-    var accum = 0
-    accum = accum | (b(0) & 0xff) << 0
-    accum = accum | (b(1) & 0xff) << 8
-    accum = accum | (b(2) & 0xff) << 16
-    accum = accum | (b(3) & 0xff) << 24
-    java.lang.Float.intBitsToFloat(accum).toFloat
-  }
-
-  def readFloat(is: InputStream): Float = {
-    val bytes = new Array[Byte](4)
-    is.read(bytes)
-    getFloat(bytes)
-  }
-
-  // Reference https://github.com/NLPchina/Word2VEC_java
-  def loadGoogleModel(path: String): (Int, Map[String, Array[Float]]) = {
-    val bis = new BufferedInputStream(new FileInputStream(path))
-    val dis = new DataInputStream(bis)
-    val wordSize = Integer.parseInt(readString(dis))
-    val dim = Integer.parseInt(readString(dis))
-    var word2vec = Map[String, Array[Float]]()
-    for (i <- 0 until wordSize) {
-      val word = readString(dis)
-      val vectors = (1 to dim).map(j => readFloat(dis)).toArray
-      word2vec += word -> vectors
-    }
-    bis.close()
-    dis.close()
-    (dim, word2vec)
-  }
-
-  // Map sentences and labels to vectors based on a pretrained word2vec.
-  def buildInputDataWithWord2vec(sentences: Array[Array[String]], embeddingSize: Int,
-    word2vec: Map[String, Array[Float]]): Array[Array[Array[Float]]] = {
-    val xVec = sentences.map { sentence =>
-      sentence.map { word =>
-        if (word2vec.contains(word)) word2vec(word)
-        else {
-          val temp = Random.uniform(-0.25f, 0.25f, Shape(embeddingSize), Context.cpu())
-          val result = temp.toArray
-          temp.dispose()
-          result
-        }
-      }
-    }
-    xVec
-  }
-
-  def loadMSDataWithWord2vec(dataPath: String, embeddingSize: Int,
-    word2vec: Map[String, Array[Float]]): (Array[Array[Array[Float]]], Array[Float]) = {
-    // loads the MR dataset
-    val (sentences, labels) = loadMRDataAndLabels(dataPath)
-    val sentencesPadded = padSentences(sentences)
-    (buildInputDataWithWord2vec(sentencesPadded, embeddingSize, word2vec), labels)
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/README.md b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/README.md
deleted file mode 100644
index fe4fbef18f29..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/cnntextclassification/README.md
+++ /dev/null
@@ -1,40 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# CNN Text Classification Example for Scala
-This is the example using Scala type-safe api doing CNN text classification. 
-This example is only for Illustration and not modeled to achieve the best accuracy.
-
-Please contribute to improve the dev accuracy of the model.
-
-## Setup
-
-Please configure your maven project using our latest release. An tutorial to do that can be found here:
-[IntelliJ IDE (or alternative IDE) project setup](https://mxnet.apache.org/api/scala/docs/tutorials/mxnet_scala_on_intellij)
-
-### Download the training files
-```$xslt
-https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/CNN/rt-polarity.pos
-https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/CNN/rt-polarity.neg
-```
-### Download pretrained Word2Vec Model
-I used the SLIM version, you can try with the full version to see if the accuracy can improve
-```$xslt
-https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/CNN/GoogleNews-vectors-negative300-SLIM.bin
-```
-### Train the model
-Please configure the [args](https://github.com/apache/incubator-mxnet/blob/scala-package/examples/src/main/scala/org/apache/mxnet/examples/cnntextclassification/CNNTextClassification.scala#L299-L312) required for the model here and then run it.
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/GanMnist.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/GanMnist.scala
deleted file mode 100644
index 8b312c621758..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/GanMnist.scala
+++ /dev/null
@@ -1,213 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.gan
-
-import org.apache.mxnet.{Context, CustomMetric, DataBatch, IO, NDArray, ResourceScope, Shape, Symbol, Xavier}
-import org.apache.mxnet.optimizer.Adam
-import org.kohsuke.args4j.{CmdLineParser, Option}
-import org.slf4j.LoggerFactory
-
-import scala.collection.JavaConverters._
-
-object GanMnist {
-
-  private val logger = LoggerFactory.getLogger(classOf[GanMnist])
-
-  // a deconv layer that enlarges the feature map
-  def deconv2D(data: Symbol, iShape: Shape, oShape: Shape,
-               kShape: (Int, Int), name: String, stride: (Int, Int) = (2, 2)): Symbol = {
-    val targetShape = Shape(oShape(oShape.length - 2), oShape(oShape.length - 1))
-    val net = Symbol.api.Deconvolution(data = Some(data), kernel = Shape(kShape._1, kShape._2),
-      stride = Some(Shape(stride._1, stride._2)), target_shape = Some(targetShape),
-      num_filter = oShape(0), no_bias = Some(true), name = name)
-    net
-  }
-
-  def deconv2DBnRelu(data: Symbol, prefix: String, iShape: Shape,
-                     oShape: Shape, kShape: (Int, Int), eps: Float = 1e-5f + 1e-12f): Symbol = {
-    var net = deconv2D(data, iShape, oShape, kShape, name = s"${prefix}_deconv")
-    net = Symbol.api.BatchNorm(name = s"${prefix}_bn", data = Some(net),
-      fix_gamma = Some(true), eps = Some(eps))
-    net = Symbol.api.Activation(data = Some(net), act_type = "relu", name = s"${prefix}_act")
-    net
-  }
-
-  def deconv2DAct(data: Symbol, prefix: String, actType: String,
-                  iShape: Shape, oShape: Shape, kShape: (Int, Int)): Symbol = {
-    var net = deconv2D(data, iShape, oShape, kShape, name = s"${prefix}_deconv")
-    net = Symbol.api.Activation(data = Some(net), act_type = "relu", name = s"${prefix}_act")
-    net
-  }
-
-  def makeDcganSym(oShape: Shape, ngf: Int = 128, finalAct: String = "sigmoid",
-                   eps: Float = 1e-5f + 1e-12f): (Symbol, Symbol) = {
-
-    val code = Symbol.Variable("rand")
-    var net = Symbol.api.FullyConnected(data = Some(code), num_hidden = 4 * 4 * ngf * 4,
-      no_bias = Some(true), name = " g1")
-    net = Symbol.api.Activation(data = Some(net), act_type = "relu", name = "gact1")
-    // 4 x 4
-    net = Symbol.api.Reshape(data = Some(net), shape = Some(Shape(-1, ngf * 4, 4, 4)))
-    // 8 x 8
-    net = deconv2DBnRelu(net, prefix = "g2",
-      iShape = Shape(ngf * 4, 4, 4), oShape = Shape(ngf * 2, 8, 8), kShape = (3, 3))
-    // 14x14
-    net = deconv2DBnRelu(net, prefix = "g3",
-      iShape = Shape(ngf * 2, 8, 8), oShape = Shape(ngf, 14, 14), kShape = (4, 4))
-    // 28x28
-    val gout = deconv2DAct(net, prefix = "g4", actType = finalAct, iShape = Shape(ngf, 14, 14),
-      oShape = Shape(oShape.toArray.takeRight(3)), kShape = (4, 4))
-
-    val data = Symbol.Variable("data")
-    // 28 x 28
-    val conv1 = Symbol.api.Convolution(data = Some(data), kernel = Shape(5, 5),
-      num_filter = 20, name = "conv1")
-    val tanh1 = Symbol.api.Activation(data = Some(conv1), act_type = "tanh")
-    val pool1 = Symbol.api.Pooling(data = Some(tanh1), pool_type = Some("max"),
-      kernel = Some(Shape(2, 2)), stride = Some(Shape(2, 2)))
-    // second conv
-    val conv2 = Symbol.api.Convolution(data = Some(pool1), kernel = Shape(5, 5),
-      num_filter = 50, name = "conv2")
-    val tanh2 = Symbol.api.Activation(data = Some(conv2), act_type = "tanh")
-    val pool2 = Symbol.api.Pooling(data = Some(tanh2), pool_type = Some("max"),
-      kernel = Some(Shape(2, 2)), stride = Some(Shape(2, 2)))
-    var d5 = Symbol.api.Flatten(data = Some(pool2))
-    d5 = Symbol.api.FullyConnected(data = Some(d5), num_hidden = 500, name = "fc1")
-    d5 = Symbol.api.Activation(data = Some(d5), act_type = "tanh")
-    d5 = Symbol.api.FullyConnected(data = Some(d5), num_hidden = 1, name = "fc_dloss")
-    val dloss = Symbol.api.LogisticRegressionOutput(data = Some(d5), name = "dloss")
-
-    (gout, dloss)
-  }
-
-  // Evaluation
-  def ferr(label: NDArray, pred: NDArray): Float = {
-    val predArr = pred.toArray.map(p => if (p > 0.5) 1f else 0f)
-    val labelArr = label.toArray
-    labelArr.zip(predArr).map { case (l, p) => Math.abs(l - p) }.sum / label.shape(0)
-  }
-
-  def runTraining(dataPath : String, context : Context,
-                  outputPath : String, numEpoch : Int): Float = {
-    val output = ResourceScope.using() {
-      val lr = 0.0005f
-      val beta1 = 0.5f
-      val batchSize = 100
-      val randShape = Shape(batchSize, 100)
-      val dataShape = Shape(batchSize, 1, 28, 28)
-
-      val (symGen, symDec) =
-        makeDcganSym(oShape = dataShape, ngf = 32, finalAct = "sigmoid")
-
-      val gMod = new GANModule(
-        symGen,
-        symDec,
-        context = context,
-        dataShape = dataShape,
-        codeShape = randShape)
-
-      gMod.initGParams(new Xavier(factorType = "in", magnitude = 2.34f))
-      gMod.initDParams(new Xavier(factorType = "in", magnitude = 2.34f))
-
-      gMod.initOptimizer(new Adam(learningRate = lr, wd = 0f, beta1 = beta1))
-
-      val params = Map(
-        "image" -> s"$dataPath/train-images-idx3-ubyte",
-        "label" -> s"$dataPath/train-labels-idx1-ubyte",
-        "input_shape" -> s"(1, 28, 28)",
-        "batch_size" -> s"$batchSize",
-        "shuffle" -> "True"
-      )
-
-      val mnistIter = IO.MNISTIter(params)
-
-      val metricAcc = new CustomMetric(ferr, "ferr")
-
-      var t = 0
-      var dataBatch: DataBatch = null
-      var acc = 0.0f
-      for (epoch <- 0 until numEpoch) {
-        mnistIter.reset()
-        metricAcc.reset()
-        t = 0
-        while (mnistIter.hasNext) {
-          dataBatch = mnistIter.next()
-          ResourceScope.using() {
-            gMod.update(dataBatch)
-            gMod.dLabel.set(0f)
-            metricAcc.update(Array(gMod.dLabel), gMod.outputsFake)
-            gMod.dLabel.set(1f)
-            metricAcc.update(Array(gMod.dLabel), gMod.outputsReal)
-
-            if (t % 50 == 0) {
-              val (name, value) = metricAcc.get
-              acc = value(0)
-              logger.info(s"epoch: $epoch, iter $t, metric=${value.mkString(" ")}")
-              Viz.imSave("gout", outputPath, gMod.tempOutG(0), flip = true)
-              val diff = gMod.tempDiffD
-              val arr = diff.toArray
-              val mean = arr.sum / arr.length
-              val std = {
-                val tmpA = arr.map(a => (a - mean) * (a - mean))
-                Math.sqrt(tmpA.sum / tmpA.length).toFloat
-              }
-              diff.set((diff - mean) / std + 0.5f)
-              Viz.imSave("diff", outputPath, diff, flip = true)
-              Viz.imSave("data", outputPath, dataBatch.data(0), flip = true)
-            }
-          }
-          dataBatch.dispose()
-          t += 1
-        }
-      }
-      acc
-    }
-    output
-  }
-
-  def main(args: Array[String]): Unit = {
-    val anst = new GanMnist
-    val parser: CmdLineParser = new CmdLineParser(anst)
-    try {
-      parser.parseArgument(args.toList.asJava)
-
-      val dataPath = if (anst.mnistDataPath == null) System.getenv("MXNET_HOME")
-      else anst.mnistDataPath
-
-      assert(dataPath != null)
-      val context = if (anst.gpu == -1) Context.cpu() else Context.gpu(anst.gpu)
-
-      runTraining(dataPath, context, anst.outputPath, 100)
-    } catch {
-      case ex: Exception => {
-        logger.error(ex.getMessage, ex)
-        parser.printUsage(System.err)
-        sys.exit(1)
-      }
-    }
-  }
-}
-
-class GanMnist {
-  @Option(name = "--mnist-data-path", usage = "the mnist data path")
-  private val mnistDataPath: String = null
-  @Option(name = "--output-path", usage = "the path to save the generated result")
-  private val outputPath: String = null
-  @Option(name = "--gpu", usage = "which gpu card to use, default is -1, means using cpu")
-  private val gpu: Int = -1
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/Module.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/Module.scala
deleted file mode 100644
index 55b52965230b..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/Module.scala
+++ /dev/null
@@ -1,162 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.gan
-
-import org.apache.mxnet.Symbol
-import org.apache.mxnet.Context
-import org.apache.mxnet.Shape
-import org.apache.mxnet.Optimizer
-import org.apache.mxnet.NDArray
-import org.apache.mxnet.Initializer
-import org.apache.mxnet.DataBatch
-import org.apache.mxnet.Random
-
-class GANModule(
-              symbolGenerator: Symbol,
-              symbolEncoder: Symbol,
-              context: Context,
-              dataShape: Shape,
-              codeShape: Shape,
-              posLabel: Float = 0.9f) {
-
-  // generator
-  private val gDataLabelShape = Map("rand" -> codeShape)
-  private val (gArgShapes, gOutShapes, gAuxShapes) = symbolGenerator.inferShape(gDataLabelShape)
-
-  private val gArgNames = symbolGenerator.listArguments()
-  private val gArgDict = gArgNames.zip(gArgShapes.map(NDArray.empty(_, context))).toMap
-
-  private val gGradDict = gArgNames.zip(gArgShapes).filter { case (name, shape) =>
-    !gDataLabelShape.contains(name)
-  }.map(x => x._1 -> NDArray.empty(x._2, context) ).toMap
-
-  private val gData = gArgDict("rand")
-
-  val gAuxNames = symbolGenerator.listAuxiliaryStates()
-  val gAuxDict = gAuxNames.zip(gAuxShapes.map(NDArray.empty(_, context))).toMap
-  private val gExecutor =
-    symbolGenerator.bind(context, gArgDict, gGradDict, "write", gAuxDict, null, null)
-
-  // discriminator
-  private val batchSize = dataShape(0)
-
-  private val dDataShape = Map("data" -> dataShape)
-  private val dLabelShape = Map("dloss_label" -> Shape(batchSize))
-  private val (dArgShapes, _, dAuxShapes) = symbolEncoder.inferShape(dDataShape ++ dLabelShape)
-
-  private val dArgNames = symbolEncoder.listArguments()
-  private val dArgDict = dArgNames.zip(dArgShapes.map(NDArray.empty(_, context))).toMap
-
-  private val dGradDict = dArgNames.zip(dArgShapes).filter { case (name, shape) =>
-    !dLabelShape.contains(name)
-  }.map(x => x._1 -> NDArray.empty(x._2, context) ).toMap
-
-  private val tempGradD = dArgNames.zip(dArgShapes).filter { case (name, shape) =>
-    !dLabelShape.contains(name)
-  }.map(x => x._1 -> NDArray.empty(x._2, context) ).toMap
-
-  private val dData = dArgDict("data")
-  val dLabel = dArgDict("dloss_label")
-
-  val dAuxNames = symbolEncoder.listAuxiliaryStates()
-  val dAuxDict = dAuxNames.zip(dAuxShapes.map(NDArray.empty(_, context))).toMap
-  private val dExecutor =
-    symbolEncoder.bind(context, dArgDict, dGradDict, "write", dAuxDict, null, null)
-
-  val tempOutG = gOutShapes.map(NDArray.empty(_, context)).toArray
-  val tempDiffD: NDArray = dGradDict("data")
-
-  var outputsFake: Array[NDArray] = null
-  var outputsReal: Array[NDArray] = null
-
-  def initGParams(initializer: Initializer): Unit = {
-    gArgDict.filter(x => !gDataLabelShape.contains(x._1))
-                   .foreach { case (name, ndArray) => initializer(name, ndArray) }
-  }
-
-  def initDParams(initializer: Initializer): Unit = {
-    dArgDict.filter(x => !dDataShape.contains(x._1) && !dLabelShape.contains(x._1))
-                   .foreach { case (name, ndArray) => initializer(name, ndArray) }
-  }
-
-  private var gOpt: Optimizer = null
-  private var gParamsGrads: List[(Int, String, NDArray, AnyRef)] = null
-  private var dOpt: Optimizer = null
-  private var dParamsGrads: List[(Int, String, NDArray, AnyRef)] = null
-
-  def initOptimizer(opt: Optimizer): Unit = {
-    gOpt = opt
-    gParamsGrads = gGradDict.toList.zipWithIndex.map { case ((name, grad), idx) =>
-      (idx, name, grad, gOpt.createState(idx, gArgDict(name)))
-    }
-    dOpt = opt
-    dParamsGrads =
-      dGradDict.filter(x => !dDataShape.contains(x._1))
-      .toList.zipWithIndex.map { case ((name, grad), idx) =>
-        (idx, name, grad, dOpt.createState(idx, dArgDict(name)))
-    }
-  }
-
-  private def saveTempGradD(): Unit = {
-    val keys = this.dGradDict.keys
-    for (k <- keys) {
-      this.dGradDict(k).copyTo(this.tempGradD(k))
-    }
-  }
-
-  // add back saved gradient
-  private def addTempGradD(): Unit = {
-    val keys = this.dGradDict.keys
-    for (k <- keys) {
-      this.dGradDict(k).set(this.dGradDict(k) + this.tempGradD(k))
-    }
-  }
-
-  // update the model for a single batch
-  def update(dBatch: DataBatch): Unit = {
-    // generate fake image
-    this.gData.set(Random.normal(0, 1.0f, this.gData.shape, context))
-    this.gExecutor.forward(isTrain = true)
-    val outG = this.gExecutor.outputs(0)
-    this.dLabel.set(0f)
-    this.dData.set(outG)
-    this.dExecutor.forward(isTrain = true)
-    this.dExecutor.backward()
-    this.saveTempGradD()
-    // update generator
-    this.dLabel.set(1f)
-    this.dExecutor.forward(isTrain = true)
-    this.dExecutor.backward()
-    this.gExecutor.backward(tempDiffD)
-    gParamsGrads.foreach { case (idx, name, grad, optimState) =>
-      gOpt.update(idx, gArgDict(name), grad, optimState)
-    }
-    this.outputsFake = this.dExecutor.outputs.map(x => x.copy())
-    // update discriminator
-    this.dLabel.set(posLabel)
-    this.dData.set(dBatch.data(0))
-    this.dExecutor.forward(isTrain = true)
-    this.dExecutor.backward()
-    this.addTempGradD()
-    dParamsGrads.foreach { case (idx, name, grad, optimState) =>
-      dOpt.update(idx, dArgDict(name), grad, optimState)
-    }
-    this.outputsReal = this.dExecutor.outputs.map(x => x.copy())
-    this.tempOutG.indices.foreach(i => this.tempOutG(i).set(this.gExecutor.outputs(i)))
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/README.md b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/README.md
deleted file mode 100644
index a4536a7662e4..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/README.md
+++ /dev/null
@@ -1,35 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# GAN MNIST Example for Scala
-This is the GAN MNIST Training Example implemented for Scala type-safe api
-
-This example is only for Illustration and not modeled to achieve the best accuracy.
-## Setup
-### Download the source File
-```$xslt
-https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/mnist/mnist.zip
-```
-### Unzip the file
-```$xslt
-unzip mnist.zip
-```
-### Arguement Configuration
-Then you need to define the arguments that you would like to pass in the model:
-```$xslt
---mnist-data-path <location of your downloaded file>
-```
\ No newline at end of file
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/Viz.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/Viz.scala
deleted file mode 100644
index c58f44da474a..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/gan/Viz.scala
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.gan
-
-import org.opencv.core.Core
-import org.opencv.highgui.Highgui
-import org.opencv.imgproc.Imgproc
-import org.apache.mxnet.NDArray
-import org.opencv.core.Mat
-import org.opencv.core.CvType
-import java.util.ArrayList
-import org.opencv.core.Size
-
-object Viz {
-
-  nu.pattern.OpenCV.loadShared()
-
-  private def clip(x: Array[Float]): Array[Byte] = {
-    x.map(_ * 255f).map(x => if (x < 0f) 0 else if (x > 255f) 255 else x.toInt)
-      .map(_.toByte)
-  }
-
-  private def getImg(rawData: Array[Byte],
-    channels: Int, height: Int, width: Int, flip: Boolean): Mat = {
-    val totals = height * width
-     val img = if (channels > 1) { // rbg image
-      val (rA, gA, bA) = {
-        val tmp = rawData.grouped(totals).toArray
-        (tmp(0), tmp(1), tmp(2))
-      }
-
-      val rr = new Mat(height, width, CvType.CV_8U)
-      rr.put(0, 0, rA)
-      val gg = new Mat(height, width, CvType.CV_8U)
-      gg.put(0, 0, gA)
-      val bb = new Mat(height, width, CvType.CV_8U)
-      bb.put(0, 0, bA)
-
-      val result = new Mat()
-      val layers = new ArrayList[Mat]()
-      layers.add(bb)
-      layers.add(gg)
-      layers.add(rr)
-      Core.merge(layers, result)
-      result
-    } else { // gray image
-      val result = new Mat(height, width, CvType.CV_8U)
-      result.put(0, 0, rawData)
-      result
-    }
-    if (flip) {
-      val result = new Mat()
-      Core.flip(img, result, 0)
-      result
-    } else img
-  }
-
-  def imSave(title: String, outputPath: String, x: NDArray, flip: Boolean = false): Unit = {
-    val shape = x.shape
-    assert(shape.length == 4)
-
-    val (n, c, h, w) = (shape(0), shape(1), shape(2), shape(3))
-
-    val totals = h * w
-    val rawData = clip(x.toArray)
-
-    val img = {
-      val row, col = Math.sqrt(n).toInt
-      val lineArrs = rawData.grouped(col * c * totals)
-
-      val lineMats = new ArrayList[Mat]()
-
-      for (line <- lineArrs) {
-        val imgArr = line.grouped(c * totals)
-        val colMats = new Mat
-        val src = new ArrayList[Mat]()
-
-        for(arr <- imgArr) src.add(getImg(arr, c, h, w, flip))
-
-        Core.hconcat(src, colMats)
-        lineMats.add(colMats)
-      }
-      val result = new Mat()
-      Core.vconcat(lineMats, result)
-      result
-    }
-    val resizedImg = new Mat
-    Imgproc.resize(img, resizedImg, new Size(img.width() * 1.5, img.height() * 1.5))
-    Highgui.imwrite(s"$outputPath/$title.jpg", resizedImg)
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/README.md b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/README.md
deleted file mode 100644
index e533130aa71c..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/README.md
+++ /dev/null
@@ -1,53 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# Image Classification Models
-
-This examples contains a number of image classification models that can be run on various datasets.
-
-## Models
-
-Currently, the following models are supported:
-- MultiLayerPerceptron
-- Lenet
-- Resnet
-
-## Datasets
-
-Currently, the following datasets are supported:
-- MNIST
-
-#### Synthetic Benchmark Data
-
-Additionally, the datasets can be replaced by randomly generated data for benchmarking.
-Data is produced to match the shapes of the supported datasets above.
-
-The following additional dataset image shapes are also defined for use with the benchmark synthetic data:
-- imagenet
-
-
-
-## Setup
-
-### MNIST
-
-For this dataset, the data must be downloaded and extracted from the source or 
-```$xslt
-https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/mnist/mnist.zip
-```
-
-Afterwards, the location of the data folder must be passed in through the `--data-dir` argument.
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainModel.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainModel.scala
deleted file mode 100644
index 9f0430eaada6..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/TrainModel.scala
+++ /dev/null
@@ -1,222 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.imclassification
-
-import java.util.concurrent._
-
-import org.apache.mxnet.DType.DType
-import org.apache.mxnetexamples.imclassification.models._
-import org.apache.mxnetexamples.imclassification.util.Trainer
-import org.apache.mxnet._
-import org.apache.mxnetexamples.imclassification.datasets.{MnistIter, SyntheticDataIter}
-import org.kohsuke.args4j.{CmdLineParser, Option}
-import org.slf4j.LoggerFactory
-
-import scala.collection.JavaConverters._
-import scala.collection.mutable
-
-object TrainModel {
-  private val logger = LoggerFactory.getLogger(classOf[TrainModel])
-
-  /**
-    * Trains the given model on the "mnist" dataset configuration with a fixed batch size
-    * of 128, a single CPU context and a local KVStore.
-    * @param model The model identifying string
-    * @param dataPath Path to location of image data
-    * @param numExamples Number of image data examples
-    * @param numEpochs Number of epochs to train for
-    * @param benchmark Whether to use benchmark synthetic data instead of real image data
-    * @param dtype Data type forwarded to the model symbol and the synthetic data iterator
-    * @return The final validation accuracy
-    */
-  def test(model: String, dataPath: String, numExamples: Int = 60000,
-           numEpochs: Int = 10, benchmark: Boolean = false,
-           dtype: DType = DType.Float32): Float = {
-    ResourceScope.using() {
-      val devs = Array(Context.cpu(0))
-      val (dataLoader, net) = dataLoaderAndModel("mnist", model, dataPath,
-        numExamples = numExamples, benchmark = benchmark, dtype = dtype)
-      val accuracy = Trainer.fit(batchSize = 128, numExamples, devs = devs,
-        network = net, dataLoader = dataLoader,
-        kvStore = "local", numEpochs = numEpochs)
-      logger.info("Finish test fit ...")
-      // Only the first entry of the second component returned by `get` is reported.
-      val (_, values) = accuracy.get
-      values(0)
-    }
-  }
-
-  /**
-    * Gets dataset iterator and model symbol
-    * @param dataset The dataset identifying string ("mnist" or "imagenet")
-    * @param model The model identifying string ("mlp", "lenet" or "resnet")
-    * @param dataDir Path to location of image data
-    * @param numLayers The number of model layers (resnet only)
-    * @param numExamples The number of examples in the dataset
-    * @param benchmark Whether to use benchmark synthetic data instead of real image data
-    * @param dtype Data type forwarded to the model symbol and the synthetic data iterator
-    * @return Data iterator (partially applied function) and model symbol
-    */
-  def dataLoaderAndModel(dataset: String, model: String, dataDir: String = "",
-                         numLayers: Int = 50, numExamples: Int = 60000,
-                         benchmark: Boolean = false, dtype: DType = DType.Float32
-                        ): ((Int, KVStore) => (DataIter, DataIter), Symbol) = {
-    val (imageShape, numClasses) = dataset match {
-      case "mnist" => (List(1, 28, 28), 10)
-      case "imagenet" => (List(3, 224, 224), 1000)
-      case _ => throw new Exception("Invalid image data collection")
-    }
-
-    val List(channels, height, width) = imageShape
-    val dataSize: Int = channels * height * width
-    // The MLP consumes flattened images; the convolutional models keep (channels, height, width).
-    val (datumShape, net) = model match {
-      case "mlp" => (List(dataSize), MultiLayerPerceptron.getSymbol(numClasses, dtype = dtype))
-      case "lenet" => (List(channels, height, width), Lenet.getSymbol(numClasses, dtype = dtype))
-      case "resnet" => (List(channels, height, width), Resnet.getSymbol(numClasses,
-        numLayers, imageShape, dtype = dtype))
-      case _ => throw new Exception("Invalid model name")
-    }
-
-    val dataLoader: (Int, KVStore) => (DataIter, DataIter) = if (benchmark) {
-      // The same synthetic iterator instance is used for both training and validation.
-      (batchSize: Int, kv: KVStore) => {
-        val iter = new SyntheticDataIter(numClasses, batchSize, datumShape, List(), numExamples,
-          dtype)
-        (iter, iter)
-      }
-    } else {
-      dataset match {
-        case "mnist" => MnistIter.getIterator(Shape(datumShape), dataDir)
-        // Trailing space keeps "the" and "synthetic" apart in the concatenated message.
-        case _ => throw new Exception("This image data collection only supports the "
-          + "synthetic benchmark iterator.  Use --benchmark to enable")
-      }
-    }
-    (dataLoader, net)
-  }
-
-  /**
-    * Runs image classification training from CLI with various options.
-    * Parses the arguments into a TrainModel instance, builds the data loader and model,
-    * then either starts a KVStoreServer (role other than "worker") or runs Trainer.fit.
-    * Any exception is logged, the usage is printed to stderr and the process exits with code 1.
-    * @param args CLI args
-    */
-  def main(args: Array[String]): Unit = {
-    val inst = new TrainModel
-    val parser: CmdLineParser = new CmdLineParser(inst)
-    try {
-      ResourceScope.using() {
-        parser.parseArgument(args.toList.asJava)
-
-        // Fall back to the MXNET_HOME environment variable when no data directory is set.
-        val dataPath = if (inst.dataDir == null) System.getenv("MXNET_HOME")
-        else inst.dataDir
-
-        val dtype = DType.withName(inst.dType)
-
-        val (dataLoader, net) = dataLoaderAndModel(inst.dataset, inst.network, dataPath,
-          inst.numLayers, inst.numExamples, inst.benchmark, dtype)
-
-        // --gpus takes precedence over --cpus; with neither set, a single cpu(0) context is used.
-        val devs =
-          if (inst.gpus != null) inst.gpus.split(',').map(id => Context.gpu(id.trim.toInt))
-          else if (inst.cpus != null) inst.cpus.split(',').map(id => Context.cpu(id.trim.toInt))
-          else Array(Context.cpu(0))
-
-        // Parameter-server environment; only passed to KVStoreServer.init when a scheduler
-        // host is configured.
-        val envs: mutable.Map[String, String] = mutable.HashMap.empty[String, String]
-        envs.put("DMLC_ROLE", inst.role)
-        if (inst.schedulerHost != null) {
-          require(inst.schedulerPort > 0, "scheduler port not specified")
-          envs.put("DMLC_PS_ROOT_URI", inst.schedulerHost)
-          envs.put("DMLC_PS_ROOT_PORT", inst.schedulerPort.toString)
-          require(inst.numWorker > 0, "Num of workers must > 0")
-          envs.put("DMLC_NUM_WORKER", inst.numWorker.toString)
-          require(inst.numServer > 0, "Num of servers must > 0")
-          envs.put("DMLC_NUM_SERVER", inst.numServer.toString)
-          logger.info("Init PS environments")
-          KVStoreServer.init(envs.toMap)
-        }
-
-        // Only the "worker" role trains; every other role starts a KVStoreServer.
-        if (inst.role != "worker") {
-          logger.info("Start KVStoreServer for scheduler & servers")
-          KVStoreServer.start()
-        } else {
-          Trainer.fit(batchSize = inst.batchSize, numExamples = inst.numExamples, devs = devs,
-            network = net, dataLoader = dataLoader,
-            kvStore = inst.kvStore, numEpochs = inst.numEpochs,
-            modelPrefix = inst.modelPrefix, loadEpoch = inst.loadEpoch,
-            lr = inst.lr, lrFactor = inst.lrFactor, lrFactorEpoch = inst.lrFactorEpoch,
-            monitorSize = inst.monitor)
-          logger.info("Finish fit ...")
-        }
-      }
-    } catch {
-      // Covers argument-parsing failures as well as errors raised during setup or training.
-      case ex: Exception => {
-        logger.error(ex.getMessage, ex)
-        parser.printUsage(System.err)
-        sys.exit(1)
-      }
-    }
-  }
-}
-
-/**
-  * Command-line options for the image classification trainer.
-  * An instance is handed to args4j's CmdLineParser in TrainModel.main, which fills the
-  * annotated fields from the CLI arguments; the values below are the defaults.
-  */
-class TrainModel {
-  // Model selection
-  @Option(name = "--network", usage = "the cnn to use: ['mlp', 'lenet', 'resnet']")
-  private val network: String = "mlp"
-  @Option(name = "--num-layers", usage = "the number of resnet layers to use")
-  private val numLayers: Int = 50
-  @Option(name = "--data-dir", usage = "the input data directory")
-  private val dataDir: String = "mnist/"
-
-  // Dataset selection
-  @Option(name = "--dataset", usage = "the images to classify: ['mnist', 'imagenet']")
-  private val dataset: String = "mnist"
-  @Option(name = "--benchmark", usage = "Benchmark to use synthetic data to measure performance")
-  private val benchmark: Boolean = false
-
-  // Devices and training hyper-parameters
-  @Option(name = "--gpus", usage = "the gpus will be used, e.g. '0,1,2,3'")
-  private val gpus: String = null
-  @Option(name = "--cpus", usage = "the cpus will be used, e.g. '0,1,2,3'")
-  private val cpus: String = null
-  @Option(name = "--num-examples", usage = "the number of training examples")
-  private val numExamples: Int = 60000
-  @Option(name = "--batch-size", usage = "the batch size")
-  private val batchSize: Int = 128
-  @Option(name = "--lr", usage = "the initial learning rate")
-  private val lr: Float = 0.1f
-  @Option(name = "--model-prefix", usage = "the prefix of the model to load/save")
-  private val modelPrefix: String = null
-  @Option(name = "--num-epochs", usage = "the number of training epochs")
-  private val numEpochs = 10
-  @Option(name = "--load-epoch", usage = "load the model on an epoch using the model-prefix")
-  private val loadEpoch: Int = -1
-  @Option(name = "--kv-store", usage = "the kvstore type")
-  private val kvStore = "local"
-  @Option(name = "--lr-factor",
-    usage = "times the lr with a factor for every lr-factor-epoch epoch")
-  private val lrFactor: Float = 1f
-  @Option(name = "--lr-factor-epoch", usage = "the number of epoch to factor the lr, could be .5")
-  private val lrFactorEpoch: Float = 1f
-  @Option(name = "--monitor", usage = "monitor the training process every N batch")
-  private val monitor: Int = -1
-
-  // Distributed (parameter server) settings, consumed by TrainModel.main
-  @Option(name = "--role", usage = "scheduler/server/worker")
-  private val role: String = "worker"
-  @Option(name = "--scheduler-host", usage = "Scheduler hostname / ip address")
-  private val schedulerHost: String = null
-  @Option(name = "--scheduler-port", usage = "Scheduler port")
-  private val schedulerPort: Int = 0
-  @Option(name = "--num-worker", usage = "# of workers")
-  private val numWorker: Int = 1
-  @Option(name = "--num-server", usage = "# of servers")
-  private val numServer: Int = 1
-  @Option(name = "--dtype", usage = "data type of the model to train. " +
-    "Can be float32/float64. Works only with synthetic data currently")
-  private val dType: String = "float32"
-}
-
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/MnistIter.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/MnistIter.scala
deleted file mode 100644
index 9e6e1c2a3269..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/MnistIter.scala
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.imclassification.datasets
-
-import org.apache.mxnet._
-
-object MnistIter {
-  /**
-    * Returns training and validation iterators over the MNIST dataset
-    * @param dataShape Image size (channels, height, width)
-    * @param dataDir The directory holding the four MNIST idx files; a trailing path
-    *                separator is optional, and an empty string leaves the file names relative
-    * @param batchSize Number of images per batch
-    * @param kv KVStore to use; its worker count and rank select the data partition
-    * @return The (training, validation) iterator pair
-    */
-  def getIterator(dataShape: Shape, dataDir: String)
-                 (batchSize: Int, kv: KVStore): (DataIter, DataIter) = {
-    // Three-entry shapes are read as images; anything else is read flattened.
-    val flat = if (dataShape.size == 3) "False" else "True"
-
-    // Append a separator only when one is missing, so "mnist" and "mnist/" behave alike.
-    val hasSeparator = dataDir.endsWith("/") || dataDir.endsWith(java.io.File.separator)
-    val dirPrefix = if (dataDir.isEmpty || hasSeparator) dataDir else dataDir + "/"
-
-    val train = IO.MNISTIter(Map(
-      "image" -> (dirPrefix + "train-images-idx3-ubyte"),
-      "label" -> (dirPrefix + "train-labels-idx1-ubyte"),
-      "label_name" -> "softmax_label",
-      "input_shape" -> dataShape.toString,
-      "batch_size" -> batchSize.toString,
-      "shuffle" -> "True",
-      "flat" -> flat,
-      "num_parts" -> kv.numWorkers.toString,
-      "part_index" -> kv.`rank`.toString))
-
-    // Same configuration as the training iterator, minus the "shuffle" entry.
-    val eval = IO.MNISTIter(Map(
-      "image" -> (dirPrefix + "t10k-images-idx3-ubyte"),
-      "label" -> (dirPrefix + "t10k-labels-idx1-ubyte"),
-      "label_name" -> "softmax_label",
-      "input_shape" -> dataShape.toString,
-      "batch_size" -> batchSize.toString,
-      "flat" -> flat,
-      "num_parts" -> kv.numWorkers.toString,
-      "part_index" -> kv.`rank`.toString))
-
-    (train, eval)
-  }
-
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/SyntheticDataIter.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/SyntheticDataIter.scala
deleted file mode 100644
index 4d22b62bea81..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/datasets/SyntheticDataIter.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.imclassification.datasets
-
-import org.apache.mxnet.DType.DType
-import org.apache.mxnet._
-
-import scala.collection.immutable.ListMap
-import scala.util.Random
-
-/**
-  * Data iterator for benchmarking: one data batch and one label batch are drawn from a
-  * uniform distribution at construction time and handed out on every call to next().
-  * @param numClasses Upper bound of the generated labels when labelShape is empty
-  * @param batchSize Number of examples per batch; also the step by which curIter advances
-  * @param datumShape Shape of a single example (without the batch dimension)
-  * @param labelShape Shape of a single label (without the batch dimension)
-  * @param maxIter Iteration stops once curIter is no longer below maxIter - 1
-  * @param dType Data type of the generated data batch
-  */
-class SyntheticDataIter(numClasses: Int, val batchSize: Int, datumShape: List[Int],
-                        labelShape: List[Int], maxIter: Int, dType: DType = DType.Float32
-                       ) extends DataIter {
-  var curIter = 0
-  val random = new Random()
-  val shape = Shape(batchSize :: datumShape)
-  val batchLabelShape = Shape(batchSize :: labelShape)
-
-  // Scalar labels range over [0, numClasses); shaped labels over [0, 1).
-  val maxLabel = labelShape match {
-    case Nil => numClasses.toFloat
-    case _ => 1f
-  }
-  var label: IndexedSeq[NDArray] = IndexedSeq(
-    NDArray.api.random_uniform(Some(0f), Some(maxLabel), shape = Some(batchLabelShape)))
-  var data: IndexedSeq[NDArray] = IndexedSeq(
-    NDArray.api.random_uniform(shape = Some(shape), dtype = Some(dType.toString)))
-
-  val provideDataDesc: IndexedSeq[DataDesc] = IndexedSeq(
-    new DataDesc("data", shape, data(0).dtype, Layout.UNDEFINED))
-  val provideLabelDesc: IndexedSeq[DataDesc] = IndexedSeq(
-    new DataDesc("softmax_label", batchLabelShape, label(0).dtype, Layout.UNDEFINED))
-  val getPad: Int = 0
-
-  override def getData(): IndexedSeq[NDArray] = data
-
-  override def getIndex: IndexedSeq[Long] = IndexedSeq(curIter.toLong)
-
-  override def getLabel: IndexedSeq[NDArray] = label
-
-  override def hasNext: Boolean = curIter < maxIter - 1
-
-  override def next(): DataBatch = {
-    if (!hasNext) {
-      throw new NoSuchElementException
-    }
-    // Advance first so that the batch index reflects the updated position.
-    curIter += batchSize
-    new DataBatch(data, label, getIndex, getPad)
-  }
-
-  override def reset(): Unit = {
-    curIter = 0
-  }
-
-  override def provideData: ListMap[String, Shape] = ListMap("data" -> shape)
-
-  override def provideLabel: ListMap[String, Shape] = ListMap("softmax_label" -> batchLabelShape)
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Lenet.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Lenet.scala
deleted file mode 100644
index 6f8b138d5ccb..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Lenet.scala
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.imclassification.models
-
-import org.apache.mxnet.DType.DType
-import org.apache.mxnet._
-
-object Lenet {
-
-  /**
-    * Builds the LeNet symbol: two convolution/tanh/max-pool stages (20 and 50 filters),
-    * a 500-unit fully connected layer with tanh, and a softmax output over numClasses
-    * @param numClasses Number of classes to classify into
-    * @param dtype Data type of the "data" input variable
-    * @return model symbol
-    */
-  def getSymbol(numClasses: Int, dtype: DType = DType.Float32): Symbol = {
-    // 5x5 convolution -> tanh -> 2x2 max pooling with stride 2
-    def convStage(input: Symbol, filters: Int): Symbol = {
-      val conv = Symbol.api.Convolution(data = Some(input), kernel = Shape(5, 5),
-        num_filter = filters)
-      val activated = Symbol.api.tanh(data = Some(conv))
-      Symbol.api.Pooling(data = Some(activated), pool_type = Some("max"),
-        kernel = Some(Shape(2, 2)), stride = Some(Shape(2, 2)))
-    }
-
-    val input = Symbol.Variable("data", dType = dtype)
-    val stage1 = convStage(input, 20)
-    val stage2 = convStage(stage1, 50)
-
-    // Classifier head: flatten -> 500 hidden units -> tanh -> numClasses outputs
-    val flattened = Symbol.api.Flatten(data = Some(stage2))
-    val hidden = Symbol.api.FullyConnected(data = Some(flattened), num_hidden = 500)
-    val hiddenAct = Symbol.api.tanh(data = Some(hidden))
-    val logits = Symbol.api.FullyConnected(data = Some(hiddenAct), num_hidden = numClasses)
-
-    Symbol.api.SoftmaxOutput(name = "softmax", data = Some(logits))
-  }
-
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/MultiLayerPerceptron.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/MultiLayerPerceptron.scala
deleted file mode 100644
index 089b65f24a65..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/MultiLayerPerceptron.scala
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.imclassification.models
-
-import org.apache.mxnet.DType.DType
-import org.apache.mxnet._
-
-object MultiLayerPerceptron {
-
-  /**
-    * Builds a three-layer perceptron symbol: 128 and 64 hidden units with ReLU
-    * activations, followed by a numClasses-wide layer and a softmax output
-    * @param numClasses Number of classes to classify into
-    * @param dtype Data type of the "data" input variable
-    * @return model symbol
-    */
-  def getSymbol(numClasses: Int, dtype: DType = DType.Float32): Symbol = {
-    val input = Symbol.Variable("data", dType = dtype)
-
-    val hidden1 = Symbol.api.FullyConnected(data = Some(input), num_hidden = 128, name = "fc1")
-    val relu1 = Symbol.api.Activation(data = Some(hidden1), act_type = "relu", name = "relu")
-    val hidden2 = Symbol.api.FullyConnected(data = Some(relu1), num_hidden = 64, name = "fc2")
-    val relu2 = Symbol.api.Activation(data = Some(hidden2), act_type = "relu", name = "relu2")
-    val logits = Symbol.api.FullyConnected(data = Some(relu2), num_hidden = numClasses,
-      name = "fc3")
-    Symbol.api.SoftmaxOutput(name = "softmax", data = Some(logits))
-  }
-
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Resnet.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Resnet.scala
deleted file mode 100644
index e5f597680f99..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/models/Resnet.scala
+++ /dev/null
@@ -1,180 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.imclassification.models
-
-import org.apache.mxnet.DType.DType
-import org.apache.mxnet._
-
-object Resnet {
-  /**
-    * Helper to produce individual residual unit.
-    * Applies BatchNorm + ReLU before each convolution and adds a shortcut to the result:
-    * the input itself when dimMatch is true, otherwise a strided 1x1 convolution of the
-    * first activation.
-    * @param data Input symbol
-    * @param numFilter Number of output filters of the unit
-    * @param stride Stride of the strided convolution and of the projection shortcut
-    * @param dimMatch Whether the input can be added to the output without projection
-    * @param name Prefix for the names of all symbols created here
-    * @param bottleNeck Use the three-convolution (1x1, 3x3, 1x1) variant instead of two 3x3
-    * @param bnMom BatchNorm momentum
-    * @param workspace Workspace value passed to every convolution
-    * @param memonger Not used by this implementation
-    * @return Sum of the convolution branch and the shortcut
-    */
-  def residualUnit(data: Symbol, numFilter: Int, stride: Shape, dimMatch: Boolean,
-                   name: String = "", bottleNeck: Boolean = true, bnMom: Float = 0.9f,
-                   workspace: Int = 256, memonger: Boolean = false): Symbol = {
-    // BatchNorm followed by ReLU, named "<name>_bn<idx>" and "<name>_relu<idx>"
-    def bnRelu(input: Symbol, idx: Int): Symbol = {
-      val normed = Symbol.api.BatchNorm(data = Some(input), fix_gamma = Some(false),
-        eps = Some(2e-5), momentum = Some(bnMom), name = name + "_bn" + idx)
-      Symbol.api.Activation(data = Some(normed), act_type = "relu", name = name + "_relu" + idx)
-    }
-
-    // Bias-free convolution named "<name>_conv<idx>"
-    def conv(input: Symbol, filters: Int, kernel: Shape, convStride: Shape, pad: Shape,
-             idx: Int): Symbol = {
-      Symbol.api.Convolution(data = Some(input), num_filter = filters, kernel = kernel,
-        stride = Some(convStride), pad = Some(pad), no_bias = Some(true),
-        workspace = Some(workspace), name = name + "_conv" + idx)
-    }
-
-    val firstAct: Symbol = bnRelu(data, 1)
-    val branch = if (bottleNeck) {
-      // 1x1 reduce -> 3x3 (strided) -> 1x1 expand
-      val reducedFilters = (numFilter * 0.25).toInt
-      val reduced = conv(firstAct, reducedFilters, Shape(1, 1), Shape(1, 1), Shape(0, 0), 1)
-      val spatial = conv(bnRelu(reduced, 2), reducedFilters, Shape(3, 3), stride, Shape(1, 1), 2)
-      conv(bnRelu(spatial, 3), numFilter, Shape(1, 1), Shape(1, 1), Shape(0, 0), 3)
-    } else {
-      // 3x3 (strided) -> 3x3
-      val first = conv(firstAct, numFilter, Shape(3, 3), stride, Shape(1, 1), 1)
-      conv(bnRelu(first, 2), numFilter, Shape(3, 3), Shape(1, 1), Shape(1, 1), 2)
-    }
-
-    val shortcut =
-      if (dimMatch) data
-      else Symbol.api.Convolution(Some(firstAct), num_filter = numFilter, kernel = Shape(1, 1),
-        stride = Some(stride), no_bias = Some(true), workspace = Some(workspace),
-        name = name + "_sc")
-    branch + shortcut
-  }
-
-  /**
-    * Helper for building the resnet Symbol
-    * @param units Number of residual units in each stage; must have numStages entries
-    * @param numStages Number of stages
-    * @param filterList Filter counts: the head is used by conv0, the tail by the stages
-    * @param numClasses Number of classes to classify into
-    * @param imageShape The image shape as List(channels, height, width)
-    * @param bottleNeck Whether residual units use the bottleneck variant
-    * @param bnMom BatchNorm momentum, used here and forwarded to every residual unit
-    * @param workspace Workspace value passed to every convolution
-    * @param dtype Float16 inserts casts around the network body; Float32 inserts an identity
-    * @param memonger Forwarded to residualUnit
-    * @return Model symbol ending in a softmax output named "softmax"
-    */
-  def resnet(units: List[Int], numStages: Int, filterList: List[Int], numClasses: Int,
-             imageShape: List[Int], bottleNeck: Boolean = true, bnMom: Float = 0.9f,
-             workspace: Int = 256, dtype: DType = DType.Float32,
-             memonger: Boolean = false): Symbol = {
-    assert(units.size == numStages)
-    var data = Symbol.Variable("data", shape = Shape(List(4) ::: imageShape), dType = DType.Float32)
-    if (dtype == DType.Float32) {
-      data = Symbol.api.identity(Some(data), "id")
-    } else if (dtype == DType.Float16) {
-      data = Symbol.api.cast(Some(data), DType.Float16.toString)
-    }
-    data = Symbol.api.BatchNorm(Some(data), fix_gamma = Some(true), eps = Some(2e-5),
-      momentum = Some(bnMom), name = "bn_data")
-    val List(_, height, _) = imageShape
-    var body = if (height <= 32) {
-      // Small images: a single 3x3, stride-1 stem; pad 1 keeps the spatial size.
-      Symbol.api.Convolution(Some(data), num_filter = filterList.head, kernel = Shape(3, 3),
-        stride = Some(Shape(1, 1)), pad = Some(Shape(1, 1)), no_bias = Some(true), name = "conv0",
-        workspace = Some(workspace))
-    } else {
-      // Large images: 7x7, stride-2 stem (pad 3) followed by BatchNorm, ReLU and max pooling.
-      var body0 = Symbol.api.Convolution(Some(data), num_filter = filterList.head,
-        kernel = Shape(7, 7), stride = Some(Shape(2, 2)), pad = Some(Shape(3, 3)),
-        no_bias = Some(true), name = "conv0", workspace = Some(workspace))
-      body0 = Symbol.api.BatchNorm(Some(body0), fix_gamma = Some(false), eps = Some(2e-5),
-        momentum = Some(bnMom), name = "bn0")
-      body0 = Symbol.api.Activation(Some(body0), act_type = "relu", name = "relu0")
-      Symbol.api.Pooling(Some(body0), kernel = Some(Shape(3, 3)), stride = Some(Shape(2, 2)),
-        pad = Some(Shape(1, 1)), pool_type = Some("max"))
-    }
-    for (((filter, i), unit) <- filterList.tail.zipWithIndex.zip(units)) {
-      // The first unit of each stage projects the shortcut; every stage but the first
-      // also downsamples by 2.
-      val stride = Shape(if (i == 0) 1 else 2, if (i == 0) 1 else 2)
-      body = residualUnit(body, filter, stride, false, name = s"stage${i + 1}_unit${1}",
-        bottleNeck = bottleNeck, bnMom = bnMom, workspace = workspace, memonger = memonger)
-      for (j <- 0 until unit - 1) {
-        body = residualUnit(body, filter, Shape(1, 1), true, s"stage${i + 1}_unit${j + 2}",
-          bottleNeck, bnMom = bnMom, workspace = workspace, memonger = memonger)
-      }
-    }
-    val bn1 = Symbol.api.BatchNorm(Some(body), fix_gamma = Some(false), eps = Some(2e-5),
-      momentum = Some(bnMom), name = "bn1")
-    val relu1 = Symbol.api.Activation(Some(bn1), act_type = "relu", name = "relu1")
-    val pool1 = Symbol.api.Pooling(Some(relu1), global_pool = Some(true),
-      kernel = Some(Shape(7, 7)), pool_type = Some("avg"), name = "pool1")
-    val flat = Symbol.api.Flatten(Some(pool1))
-    var fc1 = Symbol.api.FullyConnected(Some(flat), num_hidden = numClasses, name = "fc1")
-    if (dtype == DType.Float16) {
-      // Cast back to Float32 before the softmax output.
-      fc1 = Symbol.api.cast(Some(fc1), DType.Float32.toString)
-    }
-    Symbol.api.SoftmaxOutput(Some(fc1), name = "softmax")
-  }
-
-  /**
-    * Gets the resnet model symbol
-    * @param numClasses Number of classes to classify into
-    * @param numLayers Number of residual layers. For images with height <= 28 it must satisfy
-    *                  (numLayers - 2) % 9 == 0 with numLayers >= 164, or
-    *                  (numLayers - 2) % 6 == 0 with numLayers < 164; otherwise it must be one
-    *                  of 18, 34, 50, 101, 152, 200, 269
-    * @param imageShape The image shape as List(channels, height, width)
-    * @param convWorkspace Maximum temporary workspace allowed (MB) in convolutions
-    * @param dtype Type of data (float16, float32, etc) to use during computation
-    * @return Model symbol
-    * @throws Exception when numLayers is not a supported depth for the given image height
-    */
-  def getSymbol(numClasses: Int, numLayers: Int, imageShape: List[Int], convWorkspace: Int = 256,
-                dtype: DType = DType.Float32): Symbol = {
-    val List(_, height, _) = imageShape
-    val (numStages, units, filterList, bottleNeck): (Int, List[Int], List[Int], Boolean) =
-      if (height <= 28) {
-        // Small images: three stages with the same number of units each.
-        val (perUnit, stageFilters, useBottleNeck) =
-          if ((numLayers - 2) % 9 == 0 && numLayers >= 164) {
-            ((numLayers - 2) / 9, List(16, 64, 128, 256), true)
-          } else if ((numLayers - 2) % 6 == 0 && numLayers < 164) {
-            ((numLayers - 2) / 6, List(16, 16, 32, 64), false)
-          } else {
-            throw new Exception(s"Invalid number of layers: ${numLayers}")
-          }
-        val stageCount = 3
-        (stageCount, List.fill(stageCount)(perUnit), stageFilters, useBottleNeck)
-      } else {
-        // Larger images: four stages with a fixed unit layout per supported depth.
-        val (stageFilters, useBottleNeck) = if (numLayers >= 50) {
-          (List(64, 256, 512, 1024, 2048), true)
-        } else {
-          (List(64, 64, 128, 256, 512), false)
-        }
-        val unitsByDepth: Map[Int, List[Int]] = Map(
-          18 -> List(2, 2, 2, 2),
-          34 -> List(3, 4, 6, 3),
-          50 -> List(3, 4, 6, 3),
-          101 -> List(3, 4, 23, 3),
-          152 -> List(3, 8, 36, 3),
-          200 -> List(3, 24, 36, 3),
-          269 -> List(3, 30, 48, 8)
-        )
-        val stageUnits = unitsByDepth.getOrElse(numLayers,
-          throw new Exception(s"Invalid number of layers: ${numLayers}"))
-        (4, stageUnits, stageFilters, useBottleNeck)
-      }
-    resnet(units, numStages, filterList, numClasses, imageShape, bottleNeck,
-      workspace = convWorkspace, dtype = dtype)
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/util/Trainer.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/util/Trainer.scala
deleted file mode 100644
index 276816cf8c8c..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/imclassification/util/Trainer.scala
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.imclassification.util
-
-import org.apache.mxnet.Callback.Speedometer
-import org.apache.mxnet._
-import org.apache.mxnet.optimizer.SGD
-import org.slf4j.LoggerFactory
-
-object Trainer {
-  private val logger = LoggerFactory.getLogger(classOf[Trainer])
-
-  /**
-    * Fits a model with SGD (momentum 0.9, weight decay 1e-5) through the FeedForward API.
-    *
-    * @param batchSize Number of images per training batch
-    * @param numExamples Total number of image examples
-    * @param devs List of device contexts to use
-    * @param network The model to train
-    * @param dataLoader Function to get data loaders for training and validation data
-    * @param kvStore KVStore to use
-    * @param numEpochs Number of times to train on each image
-    * @param modelPrefix Prefix to model identification; no checkpoint is saved when null
-    * @param loadEpoch Loads a saved checkpoint at this epoch when set (>= 0)
-    * @param lr The learning rate
-    * @param lrFactor Learning rate factor (see FactorScheduler); a scheduler is only
-    *                 created when this is below 1
-    * @param lrFactorEpoch Learning rate factor epoch (see FactorScheduler)
-    * @param clipGradient Maximum gradient during optimization
-    * @param monitorSize (See Monitor); a monitor is only installed when positive
-    * @return The Accuracy metric instance that was passed to FeedForward.fit
-    */
-  // scalastyle:off parameterNum
-  def fit(batchSize: Int, numExamples: Int, devs: Array[Context],
-          network: Symbol, dataLoader: (Int, KVStore) => (DataIter, DataIter),
-          kvStore: String, numEpochs: Int, modelPrefix: String = null, loadEpoch: Int = -1,
-          lr: Float = 0.1f, lrFactor: Float = 1f, lrFactorEpoch: Float = 1f,
-          clipGradient: Float = 0f, monitorSize: Int = -1): Accuracy = {
-    // kvstore
-    ResourceScope.using()  {
-      var kv = KVStore.create(kvStore)
-
-      // load model
-      val modelPrefixWithRank =
-        if (modelPrefix == null) null
-        else modelPrefix + s"-${kv.rank}"
-
-      val (argParams, auxParams, beginEpoch) =
-        if (loadEpoch >= 0) {
-          require(modelPrefixWithRank != null)
-          val tmp = FeedForward.load(modelPrefix, loadEpoch)
-          (tmp.getArgParams, tmp.getAuxParams, loadEpoch)
-        } else {
-          (null, null, 0)
-        }
-
-      // save model
-      val checkpoint: EpochEndCallback =
-        if (modelPrefix == null) null
-        else new EpochEndCallback {
-          override def invoke(epoch: Int, symbol: Symbol,
-                              argParams: Map[String, NDArray],
-                              auxStates: Map[String, NDArray]): Unit = {
-            // Persist the aux states handed to this callback. The enclosing `auxParams`
-            // only holds what was loaded before training (null for a fresh run), so
-            // saving it would write stale or missing auxiliary state.
-            Model.saveCheckpoint(modelPrefix, epoch + 1, symbol, argParams, auxStates)
-          }
-        }
-
-      // data
-      val (train, validation) = dataLoader(batchSize, kv)
-
-      // train
-      // Number of batches per epoch; split across workers for synchronous distributed runs.
-      val epochSize =
-        if (kvStore == "dist_sync") numExamples / batchSize / kv.numWorkers
-        else numExamples / batchSize
-
-      val lrScheduler =
-        if (lrFactor < 1f) {
-          // The step is clamped to at least one update so tiny epochs still get a valid step.
-          new FactorScheduler(step = Math.max((epochSize * lrFactorEpoch).toInt, 1),
-                              factor = lrFactor)
-        } else {
-          null
-        }
-      val optimizer: Optimizer = new SGD(learningRate = lr,
-        lrScheduler = lrScheduler, clipGradient = clipGradient,
-        momentum = 0.9f, wd = 0.00001f)
-
-      // disable kvstore for single device
-      if (kv.`type`.contains("local") && (devs.length == 1 || devs(0).deviceType != "gpu")) {
-        kv.dispose()
-        kv = null
-      }
-
-      val model = new FeedForward(ctx = devs,
-                                  symbol = network,
-                                  numEpoch = numEpochs,
-                                  optimizer = optimizer,
-                                  initializer = new Xavier(factorType = "in", magnitude = 2.34f),
-                                  argParams = argParams,
-                                  auxParams = auxParams,
-                                  beginEpoch = beginEpoch,
-                                  epochSize = epochSize)
-      if (monitorSize > 0) {
-        model.setMonitor(new Monitor(monitorSize))
-      }
-      val acc = new Accuracy()
-      model.fit(trainData = train,
-                evalData = validation,
-                evalMetric = acc,
-                kvStore = kv,
-                batchEndCallback = new Speedometer(batchSize, 50),
-                epochEndCallback = checkpoint)
-      // kv is null here when it was already disposed for the single-device case above.
-      if (kv != null) {
-        kv.dispose()
-      }
-      acc
-    }
-  }
-  // scalastyle:on parameterNum
-}
-
-// Empty companion class; it only provides the Class object used to name the logger.
-class Trainer
-
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/MnistMlp.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/MnistMlp.scala
deleted file mode 100644
index 839f6ac85902..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/MnistMlp.scala
+++ /dev/null
@@ -1,171 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.module
-
-import org.apache.mxnet._
-import org.apache.mxnet.module.{FitParams, Module}
-import org.apache.mxnet.DataDesc._
-import org.apache.mxnet.optimizer.SGD
-import org.kohsuke.args4j.{Option, CmdLineParser}
-import org.slf4j.LoggerFactory
-
-import scala.collection.JavaConverters._
-
-object MnistMlp {
-  private val logger = LoggerFactory.getLogger(classOf[MnistMlp])
-
-  /**
-    * Builds the MLP used by this example: data -> fc1(128) -> relu1 -> fc2(64) -> relu2
-    * -> fc3(10) -> softmax.
-    * @return the softmax output symbol of the network
-    */
-  def getSymbol: Symbol = {
-    // Small local builders so each layer reads as one line.
-    def dense(layer: String, input: Symbol, units: Int): Symbol =
-      Symbol.FullyConnected(name = layer)(input)(Map("num_hidden" -> units))
-    def relu(layer: String, input: Symbol): Symbol =
-      Symbol.Activation(name = layer)(input)(Map("act_type" -> "relu"))
-
-    val hidden1 = relu("relu1", dense("fc1", Symbol.Variable("data"), 128))
-    val hidden2 = relu("relu2", dense("fc2", hidden1, 64))
-    Symbol.SoftmaxOutput(name = "softmax")(dense("fc3", hidden2, 10))()
-  }
-
-  /**
-    * Trains with the step-by-step Module calls (forward / updateMetric / backward / update),
-    * saving a checkpoint under "model/mnist_mlp" and logging the metric after every epoch.
-    * @param train training data iterator
-    * @param eval evaluation data iterator (not read by this method)
-    * @param cmdLine parsed command-line options; supplies the number of epochs
-    * @param loadModelEpoch checkpoint epoch to resume from, or -1 to start from scratch
-    */
-  def runIntermediateLevelApi(train: DataIter, eval: DataIter,
-      cmdLine: MnistMlp, loadModelEpoch: Int = -1): Unit = {
-    val resumeFromCheckpoint = loadModelEpoch != -1
-    val mod =
-      if (resumeFromCheckpoint) {
-        logger.info("Load checkpoint from epoch {}", loadModelEpoch)
-        Module.loadCheckpoint("model/mnist_mlp", loadModelEpoch, loadOptimizerStates = true)
-      } else {
-        new Module(getSymbol)
-      }
-
-    mod.bind(dataShapes = train.provideDataDesc, labelShapes = Some(train.provideLabelDesc))
-    mod.initParams()
-    mod.initOptimizer(optimizer = new SGD(learningRate = 0.01f, momentum = 0.9f))
-
-    val metric = new Accuracy()
-
-    (0 until cmdLine.numEpoch).foreach { epoch =>
-      // One pass over the training data.
-      while (train.hasNext) {
-        val batch = train.next()
-        mod.forward(batch)
-        mod.updateMetric(metric, batch.label)
-        mod.backward()
-        mod.update()
-      }
-
-      mod.saveCheckpoint("model/mnist_mlp", epoch, saveOptStates = true)
-
-      // The metric exposes parallel arrays of names and values.
-      val (names, values) = metric.get
-      for ((n, v) <- names.zip(values)) {
-        logger.info(s"epoch $epoch $n=$v")
-      }
-
-      // Prepare metric and iterator for the next epoch.
-      metric.reset()
-      train.reset()
-    }
-  }
-
-  /**
-    * Trains with Module.fit and then exercises the prediction and scoring APIs on `test`.
-    * @param train training data iterator (reset before fitting)
-    * @param test data iterator used for evaluation and prediction
-    * @param cmdLine parsed command-line options; supplies the number of epochs
-    */
-  def runHighLevelApi(train: DataIter, test: DataIter, cmdLine: MnistMlp): Unit = {
-    // High-level API
-    train.reset()
-    val mod = new Module(getSymbol)
-    mod.fit(train, evalData = scala.Option(test), numEpoch = cmdLine.numEpoch)
-
-    // Prediction on single batches; the accuracy of every 20th batch is logged.
-    test.reset()
-    var batchIdx = 0
-    while (test.hasNext) {
-      val batch = test.next()
-      val outputs = mod.predict(batch)
-      val predicted: Array[Int] = NDArray.argmax_channel(outputs(0)).toArray.map(_.toInt)
-      val expected = batch.label(0).toArray.map(_.toInt)
-      val hits = predicted.zip(expected).count { case (py, y) => py == y }
-      val acc = hits / predicted.length.toFloat
-      if (batchIdx % 20 == 0) {
-        logger.info(s"Batch $batchIdx acc: $acc")
-      }
-      batchIdx += 1
-    }
-
-    // a dummy call just to test if the API works
-    mod.predict(test)
-
-    // Predict every batch up front, then walk the iterator again to compare with the labels.
-    val batchOutputs = mod.predictEveryBatch(test)
-    test.reset()
-    var correct = 0.0f
-    var total = 0
-    var batchNo = 0
-    while (test.hasNext) {
-      val batch = test.next()
-      val predicted: Array[Int] =
-        NDArray.argmax_channel(batchOutputs(batchNo)(0)).toArray.map(_.toInt)
-      val expected = batch.label(0).toArray.map(_.toInt)
-      correct += predicted.zip(expected).count { case (py, y) => py == y }
-      total += predicted.length
-      batchNo += 1
-    }
-    logger.info(s"Validation Accuracy: {}", correct / total.toFloat)
-
-    // evaluate on validation set with a evaluation metric
-    val (names, values) = mod.score(test, new Accuracy).get
-    logger.info("Scored {} = {}", names(0), values(0))
-  }
-
-  /**
-    * Entry point: parses the command line, builds the MNIST iterators and runs the
-    * intermediate-level API twice (fresh, then resumed from the last epoch) followed by
-    * the high-level API. Any exception is logged, usage is printed and the JVM exits with 1.
-    * @param args command-line arguments (see the options declared on class MnistMlp)
-    */
-  def main(args: Array[String]): Unit = {
-    val inst = new MnistMlp
-    val parser = new CmdLineParser(inst)
-    try {
-      parser.parseArgument(args.toList.asJava)
-
-      // Join through java.io.File so --data-dir works with or without a trailing separator;
-      // plain string concatenation produced paths like "/footrain-images-idx3-ubyte".
-      def dataFile(fileName: String): String =
-        new java.io.File(inst.dataDir, fileName).getPath
-
-      val train = IO.MNISTIter(Map(
-        "image" -> dataFile("train-images-idx3-ubyte"),
-        "label" -> dataFile("train-labels-idx1-ubyte"),
-        "label_name" -> "softmax_label",
-        "input_shape" -> "(784,)",
-        "batch_size" -> inst.batchSize.toString,
-        "shuffle" -> "True",
-        "flat" -> "True", "silent" -> "False", "seed" -> "10"))
-      val eval = IO.MNISTIter(Map(
-        "image" -> dataFile("t10k-images-idx3-ubyte"),
-        "label" -> dataFile("t10k-labels-idx1-ubyte"),
-        "label_name" -> "softmax_label",
-        "input_shape" -> "(784,)",
-        "batch_size" -> inst.batchSize.toString,
-        "flat" -> "True", "silent" -> "False"))
-
-      logger.info("Run intermediate level api from beginning.")
-      runIntermediateLevelApi(train, eval, inst)
-      logger.info("Run intermediate level api, start with last trained epoch.")
-      // The previous run saved a checkpoint per epoch, the last one at numEpoch - 1.
-      runIntermediateLevelApi(train, eval, inst, loadModelEpoch = inst.numEpoch - 1)
-      logger.info("Run high level api")
-      runHighLevelApi(train, eval, inst)
-    } catch {
-      case ex: Exception =>
-        logger.error(ex.getMessage, ex)
-        parser.printUsage(System.err)
-        sys.exit(1)
-    }
-  }
-}
-
-// Command-line options for the MnistMlp example. The companion object hands an instance
-// to args4j's CmdLineParser and then reads the fields below.
-class MnistMlp {
-  // Directory that holds the MNIST idx files.
-  @Option(name = "--data-dir", usage = "the input data directory")
-  private val dataDir: String = "mnist/"
-  // Batch size passed to both MNIST iterators.
-  @Option(name = "--batch-size", usage = "the batch size for data iterator")
-  private val batchSize: Int = 2
-  // Number of epochs used by every training run of the example.
-  @Option(name = "--num-epoch", usage = "number of training epoches")
-  private val numEpoch: Int = 10
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/SequentialModuleEx.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/SequentialModuleEx.scala
deleted file mode 100644
index ea2273ebd796..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/module/SequentialModuleEx.scala
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.module
-
-import org.apache.mxnet._
-import org.apache.mxnet.module.{FitParams, Module, SequentialModule}
-import org.apache.mxnet.DataDesc._
-import org.apache.mxnet.optimizer.SGD
-import org.kohsuke.args4j.{Option, CmdLineParser}
-import org.slf4j.LoggerFactory
-import scala.collection.JavaConverters._
-
-object SequentialModuleEx {
-  private val logger = LoggerFactory.getLogger(classOf[SequentialModuleEx])
-
-  /**
-    * Builds a two-stage SequentialModule: the first module holds fc1 + relu1 and takes no
-    * labels, the second holds fc2 + relu2 + fc3 + softmax and is added with the
-    * "take_labels" and "auto_wiring" flags.
-    * @return the container module with both stages added
-    */
-  def getSeqModule(): SequentialModule = {
-    val Array(firstCtx, secondCtx) = Array(Context.cpu(), Context.cpu())
-
-    // Stage 1: feature extractor without labels.
-    val stage1Input = Symbol.Variable("data")
-    val fc1 = Symbol.FullyConnected("fc1")()(Map("data" -> stage1Input, "num_hidden" -> 128))
-    val stage1Out = Symbol.Activation("relu1")()(Map("data" -> fc1, "act_type" -> "relu"))
-    val firstModule = new Module(stage1Out, labelNames = null, contexts = firstCtx)
-
-    // Stage 2: classifier head ending in the softmax output.
-    val stage2Input = Symbol.Variable("data")
-    val fc2 = Symbol.FullyConnected("fc2")()(Map("data" -> stage2Input, "num_hidden" -> 64))
-    val relu2 = Symbol.Activation("relu2")()(Map("data" -> fc2, "act_type" -> "relu"))
-    val fc3 = Symbol.FullyConnected("fc3")()(Map("data" -> relu2, "num_hidden" -> 10))
-    val stage2Out = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc3))
-    val secondModule = new Module(stage2Out, contexts = secondCtx)
-
-    // Container module
-    val container = new SequentialModule()
-    container
-      .add(firstModule)
-      .add(secondModule, ("take_labels", true), ("auto_wiring", true))
-    container
-  }
-
-  /**
-    * Trains the sequential module with explicit forward / backward / update calls, saving
-    * the parameters and logging the accuracy metric after every epoch.
-    * @param train training data iterator
-    * @param eval evaluation data iterator (not read by this method)
-    * @param cmdLine parsed options: learning rate, epoch count, save and load paths
-    */
-  def runIntermediateLevelApi(train: DataIter, eval: DataIter,
-    cmdLine: SequentialModuleEx): Unit = {
-    // Intermediate-level API
-    val modSeq = getSeqModule()
-    modSeq.bind(dataShapes = train.provideDataDesc, labelShapes = Some(train.provideLabelDesc))
-    if (cmdLine.loadModelPath != null) {
-      logger.info(s"Load checkpoint from ${cmdLine.loadModelPath}")
-      modSeq.loadParams(cmdLine.loadModelPath)
-    } else {
-      modSeq.initParams()
-    }
-
-    modSeq.initOptimizer(optimizer = new SGD(learningRate = cmdLine.lr, momentum = 0.9f))
-
-    val metric = new Accuracy()
-
-    for (epoch <- 0 until cmdLine.numEpoch) {
-      while (train.hasNext) {
-        val batch = train.next()
-        modSeq.forward(batch)
-        modSeq.updateMetric(metric, batch.label)
-        modSeq.backward()
-        modSeq.update()
-      }
-
-      // File name pattern: <saveModelPath>/seqModule-<4-digit epoch>.params
-      val fname = "%s-%04d.params".format(s"${cmdLine.saveModelPath}/seqModule", epoch)
-      modSeq.saveParams(fname)
-
-      // metric.get yields parallel arrays; interpolating the arrays themselves would log
-      // their identity strings, so log each name/value pair instead.
-      val (names, values) = metric.get
-      names.zip(values).foreach { case (n, v) =>
-        logger.info(s"epoch $epoch $n=$v")
-      }
-      metric.reset()
-      train.reset()
-    }
-  }
-
-  /**
-    * Trains the sequential module through `fit` and then exercises the prediction and
-    * scoring APIs on `test`.
-    * @param train training data iterator (reset before fitting)
-    * @param test data iterator used for evaluation and prediction
-    * @param cmdLine parsed options: learning rate and epoch count
-    */
-  def runHighLevelApi(train: DataIter, test: DataIter, cmdLine: SequentialModuleEx): Unit = {
-    // High-level API
-    train.reset()
-    val modSeq = getSeqModule()
-    val fitParams = new FitParams
-    fitParams.setOptimizer(new SGD(learningRate = cmdLine.lr, momentum = 0.9f))
-    modSeq.fit(train, evalData = scala.Option(test),
-        numEpoch = cmdLine.numEpoch, fitParams = fitParams)
-
-    // prediction iterator API
-    var iBatch = 0
-    test.reset()
-    while (test.hasNext) {
-      val batch = test.next()
-      val preds = modSeq.predict(batch)
-      val predLabel: Array[Int] = NDArray.argmax_channel(preds(0)).toArray.map(_.toInt)
-      val label = batch.label(0).toArray.map(_.toInt)
-      val acc = predLabel.zip(label).count { case (py, y) => py == y } /
-        predLabel.length.toFloat
-      // Only every 20th batch is reported to keep the log short.
-      if (iBatch % 20 == 0) {
-        logger.info(s"Batch $iBatch acc: $acc")
-      }
-      iBatch += 1
-    }
-
-    // a dummy call just to test if the API works
-    modSeq.predict(test)
-
-    // perform prediction and calculate accuracy manually
-    val preds = modSeq.predictEveryBatch(test)
-    test.reset()
-    var accSum = 0.0f
-    var accCnt = 0
-    var i = 0
-    while (test.hasNext) {
-      val batch = test.next()
-      val predLabel: Array[Int] = NDArray.argmax_channel(preds(i)(0)).toArray.map(_.toInt)
-      val label = batch.label(0).toArray.map(_.toInt)
-      accSum += (predLabel zip label).count { case (py, y) => py == y }
-      accCnt += predLabel.length
-      i += 1
-    }
-    logger.info(s"Validation Accuracy: ${accSum / accCnt.toFloat}")
-
-    // evaluate on validation set with a evaluation metric
-    // The metric returns parallel arrays; log each pair rather than the array objects,
-    // whose string form is only an identity hash.
-    val (names, values) = modSeq.score(test, new Accuracy).get
-    names.zip(values).foreach { case (n, v) =>
-      logger.info(s"Scored $n = $v")
-    }
-  }
-
-  /**
-    * Entry point: parses the options, requires a data directory, builds the MNIST
-    * iterators and runs the intermediate-level and high-level examples in turn.
-    * Any exception is logged, usage is printed and the JVM exits with status 1.
-    * @param args command-line arguments (see the options on class SequentialModuleEx)
-    */
-  def main(args: Array[String]): Unit = {
-    val opts = new SequentialModuleEx
-    val parser = new CmdLineParser(opts)
-    try {
-      parser.parseArgument(args.toList.asJava)
-      require(opts.dataDir != null)
-
-      // Settings shared by the training and evaluation iterators.
-      val sharedParams = Map(
-        "label_name" -> "softmax_label",
-        "input_shape" -> "(784,)",
-        "batch_size" -> opts.batchSize.toString,
-        "flat" -> "True",
-        "silent" -> "False")
-
-      val trainParams = sharedParams ++ Map(
-        "image" -> s"${opts.dataDir}/train-images-idx3-ubyte",
-        "label" -> s"${opts.dataDir}/train-labels-idx1-ubyte",
-        "shuffle" -> "True",
-        "seed" -> "10")
-      val evalParams = sharedParams ++ Map(
-        "image" -> s"${opts.dataDir}/t10k-images-idx3-ubyte",
-        "label" -> s"${opts.dataDir}/t10k-labels-idx1-ubyte")
-
-      val trainDataIter = IO.MNISTIter(trainParams)
-      val evalDataIter = IO.MNISTIter(evalParams)
-
-      logger.info("Run intermediate level api from beginning.")
-      runIntermediateLevelApi(trainDataIter, evalDataIter, opts)
-      logger.info("Run high level api")
-      runHighLevelApi(trainDataIter, evalDataIter, opts)
-
-    } catch {
-      case ex: Exception =>
-        logger.error(ex.getMessage, ex)
-        parser.printUsage(System.err)
-        sys.exit(1)
-    }
-  }
-}
-
-// Command-line options for the SequentialModuleEx example. The companion object hands an
-// instance to args4j's CmdLineParser and then reads the fields below.
-class SequentialModuleEx {
-  // Directory holding the MNIST idx files; main requires it to be non-null.
-  @Option(name = "--data-dir", usage = "the input data directory")
-  private val dataDir: String = null
-  // Learning rate given to the SGD optimizer.
-  @Option(name = "--lr", usage = "the initial learning rate")
-  private val lr: Float = 0.01f
-  // Batch size passed to both MNIST iterators.
-  @Option(name = "--batch-size", usage = "the batch size for data iterator")
-  private val batchSize: Int = 100
-  // Number of training epochs.
-  @Option(name = "--num-epoch", usage = "number of training epoches")
-  private val numEpoch: Int = 100
-  // Prefix directory for the per-epoch "seqModule-NNNN.params" files.
-  // NOTE(review): with the empty default the files resolve to "/seqModule-NNNN.params" - confirm.
-  @Option(name = "--save-model-path", usage = "the model saving path")
-  private val saveModelPath: String = ""
-  // When non-null, parameters are loaded from this path instead of being initialized.
-  @Option(name = "--load-model-path", usage = "the model to be loaded")
-  private val loadModelPath: String = null
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/multitask/ExampleMultiTask.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/multitask/ExampleMultiTask.scala
deleted file mode 100644
index e406c6d21d23..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/multitask/ExampleMultiTask.scala
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.multitask
-
-import java.io.File
-import java.net.URL
-
-import org.kohsuke.args4j.{CmdLineParser, Option}
-import org.slf4j.LoggerFactory
-
-import scala.collection.JavaConverters._
-import org.apache.commons.io.FileUtils
-import org.apache.mxnet.{Context, DataBatch, DataDesc, DataIter, EvalMetric, Executor, NDArray, ResourceScope, Shape, Symbol, Xavier}
-import org.apache.mxnet.DType.DType
-import org.apache.mxnet.optimizer.RMSProp
-import org.apache.mxnetexamples.Util
-
-import scala.collection.immutable.ListMap
-import scala.language.postfixOps
-import scala.sys.process.Process
-
-/**
- * Example of multi-task
- */
-object ExampleMultiTask {
-  private val logger = LoggerFactory.getLogger(classOf[ExampleMultiTask])
-
-  /**
-    * Builds a 128 -> 64 -> 10 MLP whose last layer feeds two softmax outputs that are
-    * grouped into a single symbol.
-    * @return the grouped symbol with both softmax heads
-    */
-  def buildNetwork(): Symbol = {
-    val input = Symbol.Variable("data")
-
-    // Shared trunk. Layers are created in the same order as before because the operators
-    // are not given explicit names.
-    val dense1 = Symbol.api.FullyConnected(data = Some(input), num_hidden = 128)
-    val relu1 = Symbol.api.Activation(data = Some(dense1), act_type = "relu")
-    val dense2 = Symbol.api.FullyConnected(data = Some(relu1), num_hidden = 64)
-    val relu2 = Symbol.api.Activation(data = Some(dense2), act_type = "relu")
-    val logits = Symbol.api.FullyConnected(data = Some(relu2), num_hidden = 10)
-
-    // Two task heads on the same logits.
-    val firstHead = Symbol.api.SoftmaxOutput(data = Some(logits))
-    val secondHead = Symbol.api.SoftmaxOutput(data = Some(logits))
-
-    Symbol.Group(firstHead, secondHead)
-  }
-
-  // multi label mnist iterator
-  /**
-    * Wraps a single-label iterator and exposes every label twice, so that one data set
-    * can feed a network with two label inputs.
-    * @param dataIter the wrapped iterator; all other calls are delegated to it
-    */
-  class MultiMnistIterator(dataIter: DataIter) extends DataIter {
-
-    // Returns the given label twice, once per task.
-    private def twice(label: NDArray): IndexedSeq[NDArray] = IndexedSeq(label, label)
-
-    override def hasNext: Boolean = dataIter.hasNext
-
-    /**
-     * Fetches the next batch of the wrapped iterator with its first label duplicated.
-     * @return the batch with two identical labels
-     */
-    @throws(classOf[NoSuchElementException])
-    override def next(): DataBatch = {
-      if (!hasNext) {
-        throw new NoSuchElementException
-      }
-      val batch = dataIter.next()
-      new DataBatch(batch.data, twice(batch.label(0)), batch.index, batch.pad)
-    }
-
-    /**
-     * Resets the wrapped iterator.
-     */
-    override def reset(): Unit = dataIter.reset()
-
-    override def batchSize: Int = dataIter.batchSize
-
-    /**
-     * @return the data of the current batch
-     */
-    override def getData(): IndexedSeq[NDArray] = dataIter.getData()
-
-    /**
-     * @return the first label of the current batch, twice
-     */
-    override def getLabel(): IndexedSeq[NDArray] = twice(dataIter.getLabel()(0))
-
-    /**
-     * @return the index of the current batch
-     */
-    override def getIndex(): IndexedSeq[Long] = dataIter.getIndex()
-
-    /**
-     * @return number of padding examples in the current batch
-     */
-    override def getPad(): Int = dataIter.getPad()
-
-    // Name and shape of the labels: both entries reuse the shape of the first wrapped label.
-    @deprecated("Use provideLabelDesc instead", "1.3.0")
-    override def provideLabel: ListMap[String, Shape] = {
-      val (_, labelShape) = dataIter.provideLabel.toArray.apply(0)
-      // Different labels should be used here for actual application
-      ListMap("softmax1_label" -> labelShape,
-              "softmax2_label" -> labelShape)
-    }
-
-    // Descriptors of the labels: both copy shape, dtype and layout of the first wrapped one.
-    override def provideLabelDesc: IndexedSeq[DataDesc] = {
-      val first = dataIter.provideLabelDesc(0)
-      // Different labels should be used here for actual application
-      IndexedSeq("softmax1_label", "softmax2_label").map { labelName =>
-        new DataDesc(labelName, first.shape, first.dtype, first.layout)
-      }
-    }
-
-    // Name and shape of the data, taken unchanged from the wrapped iterator.
-    @deprecated("Use provideDataDesc instead", "1.3.0")
-    override def provideData: ListMap[String, Shape] = dataIter.provideData
-
-    override def provideDataDesc: IndexedSeq[DataDesc] = dataIter.provideDataDesc
-  }
-
-  /**
-    * Accuracy metric that tracks `num` outputs independently.
-    * @param num number of outputs to track; must be at least 1
-    * @param name metric name reported for every output
-    */
-  class MultiAccuracy(num: Int, name: String) {
-    require(num >= 1)
-
-    // Per-output count of correct predictions and of seen instances.
-    private var sumMetric: Array[Float] = new Array[Float](num)
-    private var numInst: Array[Int] = new Array[Int](num)
-
-    /**
-      * Accumulates the number of correct predictions for every output.
-      * @param labels one label array per output
-      * @param preds one prediction array per output, reduced with argmax_channel
-      */
-    def update(labels: IndexedSeq[NDArray], preds: IndexedSeq[NDArray]): Unit = {
-      require(labels.length == preds.length,
-        "labels and predictions should have the same length.")
-      assert(labels.length == num)
-
-      for (i <- labels.indices) {
-        val (pred, label) = (preds(i), labels(i))
-        val predLabel = NDArray.api.argmax_channel(data = pred)
-        // Dispose the temporary array even when the shape check below fails.
-        try {
-          require(label.shape == predLabel.shape,
-            s"label ${label.shape} and prediction ${predLabel.shape} " +
-            s"should have the same length.")
-          val hits = label.toArray.zip(predLabel.toArray).count { case (labelElem, predElem) =>
-            labelElem == predElem
-          }
-          this.sumMetric(i) += hits
-          this.numInst(i) += predLabel.shape(0)
-        } finally {
-          predLabel.dispose()
-        }
-      }
-    }
-
-    /**
-      * @return one (name, accuracy) pair per output; the accuracy is NaN for an output
-      *         that has not seen any instance yet (0 / 0 in floating point)
-      */
-    def get(): Array[(String, Float)] = {
-      (0 until num).map( i => (this.name, this.sumMetric(i) / this.numInst(i))).toArray
-    }
-
-    /** Clears all counters. */
-    def reset(): Unit = {
-      this.numInst = new Array[Int](num)
-      this.sumMetric = new Array[Float](num)
-    }
-
-  }
-
-  /**
-    * Batch-end callback that logs the training speed, and the metric values when a metric
-    * is supplied, every `frequent` batches.
-    * @param batchSize number of samples per batch
-    * @param frequent number of batches between two log lines
-    */
-  class Speedometer(val batchSize: Int, val frequent: Int = 50) {
-    private val logger = LoggerFactory.getLogger(classOf[Speedometer])
-    private var init = false
-    private var tic: Long = 0L
-    private var lastCount: Int = 0
-
-    /**
-      * @param epoch current epoch, used in the log line only
-      * @param count batch counter within the epoch
-      * @param evalMetric metric to report, or null to log the speed alone
-      */
-    def invoke(epoch: Int, count: Int, evalMetric: MultiAccuracy): Unit = {
-      // A counter that went backwards restarts the timing.
-      val counterWentBack = lastCount > count
-      if (counterWentBack) {
-        init = false
-      }
-      lastCount = count
-
-      if (!init) {
-        init = true
-        tic = System.currentTimeMillis
-      } else if (count % frequent == 0) {
-        val speed = frequent.toDouble * batchSize / (System.currentTimeMillis - tic) * 1000
-        if (evalMetric == null) {
-          logger.info("Iter[%d] Batch [%d]\tSpeed: %.2f samples/sec".format(epoch, count, speed))
-        } else {
-          evalMetric.get.foreach { case (name, value) =>
-            logger.info("Epoch[%d] Batch [%d]\tSpeed: %.2f samples/sec\tTrain-%s=%f".format(
-                epoch, count, speed, name, value))
-          }
-        }
-        tic = System.currentTimeMillis
-      }
-    }
-  }
-
-  /**
-    * Downloads the MNIST archive into "<java.io.tmpdir>/multitask/" and unpacks it there
-    * with the external `unzip` command.
-    * @return the directory path (with trailing slash) that holds the unpacked data
-    */
-  def getTrainingData: String = {
-    val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci"
-    val tempDirPath = System.getProperty("java.io.tmpdir")
-    val modelDirPath = tempDirPath + File.separator + "multitask/"
-    val zipPath = tempDirPath + "/multitask/mnist.zip"
-    Util.downloadUrl(baseUrl + "/mnist/mnist.zip", zipPath)
-
-    // TODO: Need to confirm with Windows
-    // Pass the arguments as a list: a single command string is split on whitespace and
-    // breaks as soon as the temp directory contains a space.
-    Process(Seq("unzip", zipPath, "-d", tempDirPath + "/multitask/")).!
-
-    modelDirPath
-  }
-
-  /**
-    * Trains the two-headed network with RMSProp and global gradient-norm clipping, and
-    * evaluates on the validation set after every epoch.
-    * @param batchSize number of samples per batch
-    * @param numEpoch number of passes over the training data
-    * @param ctx device context on which all arrays are allocated
-    * @param modelDirPath directory handed to Data.mnistIterator
-    * @return the bound executor and the metric holding the last validation results
-    */
-  def train(batchSize: Int, numEpoch: Int, ctx: Context, modelDirPath: String):
-  (Executor, MultiAccuracy) = {
-    ResourceScope.using() {
-      val lr = 0.001f
-      val network = ExampleMultiTask.buildNetwork()
-      val (trainIter, valIter) =
-        Data.mnistIterator(modelDirPath, batchSize = batchSize, inputShape = Shape(784))
-      val trainMultiIt = new MultiMnistIterator(trainIter)
-      val valMultiIter = new MultiMnistIterator(valIter)
-
-      // Label arguments under the names looked up in argDict below.
-      val labelArgNames = Seq("softmaxoutput0_label", "softmaxoutput1_label")
-      // Names of all network inputs. The previous code tested a String against a sequence
-      // of DataDesc, which never matches, so inputs were initialised, given gradients and
-      // updated as if they were weights.
-      val inputNames: Set[String] =
-        (trainMultiIt.provideDataDesc ++ trainMultiIt.provideLabelDesc).map(_.name).toSet ++
-          labelArgNames
-
-      val (argShapes, _, _) =
-        network.inferShape(trainMultiIt.provideDataDesc.filter(_.name == "data"))
-      val initializer = new Xavier(factorType = "in", magnitude = 2.34f)
-
-      val argNames = network.listArguments
-      val argDict = argNames.zip(argShapes.map(NDArray.empty(_, ctx))).toMap
-
-      // Gradient buffers exist for trainable parameters only.
-      val gradDict = argNames.zip(argShapes).filter { case (name, shape) =>
-        !inputNames.contains(name)
-      }.map(x => x._1 -> NDArray.empty(x._2, ctx)).toMap
-
-      argDict.foreach { case (name, ndArray) =>
-        if (!inputNames.contains(name)) {
-          initializer.initWeight(name, ndArray)
-        }
-      }
-
-      val data = argDict("data")
-      val label1 = argDict(labelArgNames(0))
-      val label2 = argDict(labelArgNames(1))
-      val maxGradNorm = 0.5f
-      val executor = network.bind(ctx, argDict, gradDict)
-
-      val opt = new RMSProp(learningRate = lr, wd = 0.00001f)
-
-      // (index, name, gradient, optimizer state) for every trainable parameter.
-      val paramsGrads = gradDict.toList.zipWithIndex.map { case ((name, grad), idx) =>
-        (idx, name, grad, opt.createState(idx, argDict(name)))
-      }
-
-      val evalMetric = new ExampleMultiTask.MultiAccuracy(num = 2, name = "multi_accuracy")
-      val batchEndCallback = new ExampleMultiTask.Speedometer(batchSize, 50)
-
-      for (epoch <- 0 until numEpoch) {
-        // Training phase
-        val tic = System.currentTimeMillis
-        evalMetric.reset()
-        var nBatch = 0
-        // Iterate over training data.
-        trainMultiIt.reset()
-
-        while (trainMultiIt.hasNext) {
-          val dataBatch = trainMultiIt.next()
-
-          data.set(dataBatch.data(0))
-          label1.set(dataBatch.label(0))
-          label2.set(dataBatch.label(1))
-
-          executor.forward(isTrain = true)
-          executor.backward()
-
-          // Global L2 norm of the batch-averaged gradients.
-          val norm = Math.sqrt(paramsGrads.map { case (idx, name, grad, optimState) =>
-            val l2Norm = NDArray.api.norm(data = (grad / batchSize)).toScalar
-            l2Norm * l2Norm
-          }.sum).toFloat
-
-          paramsGrads.foreach { case (idx, name, grad, optimState) =>
-            // Rescale the gradient when the global norm exceeds the limit.
-            if (norm > maxGradNorm) {
-              grad.set(grad.toArray.map(_ * (maxGradNorm / norm)))
-            }
-            opt.update(idx, argDict(name), grad, optimState)
-          }
-
-          // evaluate at end, so out_cpu_array can lazy copy
-          evalMetric.update(dataBatch.label, executor.outputs)
-
-          nBatch += 1
-          batchEndCallback.invoke(epoch, nBatch, evalMetric)
-        }
-        trainMultiIt.reset()
-
-        evalMetric.get.foreach { case (name, value) =>
-          logger.info(s"Epoch[$epoch] Train-$name=$value")
-        }
-        val toc = System.currentTimeMillis
-        logger.info(s"Epoch[$epoch] Time cost=${toc - tic}")
-
-        // Validation phase
-        evalMetric.reset()
-        valMultiIter.reset()
-        while (valMultiIter.hasNext) {
-          val evalBatch = valMultiIter.next()
-
-          data.set(evalBatch.data(0))
-          label1.set(evalBatch.label(0))
-          label2.set(evalBatch.label(1))
-
-          // Inference only: no backward pass follows, so run in non-training mode.
-          executor.forward(isTrain = false)
-
-          evalMetric.update(evalBatch.label, executor.outputs)
-          evalBatch.dispose()
-        }
-
-        evalMetric.get.foreach { case (name, value) =>
-          logger.info(s"Epoch[$epoch] Validation-$name=$value")
-        }
-      }
-
-      (executor, evalMetric)
-    }
-  }
-
-  /**
-    * Entry point: parses the options, resolves the data directory (downloading the data
-    * when none is given) and trains for 5 epochs with batch size 100.
-    * Any exception is logged, usage is printed and the JVM exits with status 1.
-    * @param args command-line arguments (see the options on class ExampleMultiTask)
-    */
-  def main(args: Array[String]): Unit = {
-    val lesk = new ExampleMultiTask
-    val parser: CmdLineParser = new CmdLineParser(lesk)
-    try {
-      parser.parseArgument(args.toList.asJava)
-
-      val batchSize = 100
-      val numEpoch = 5
-      val ctx = if (lesk.gpu != -1) Context.gpu(lesk.gpu) else Context.cpu()
-
-      // Use the supplied path when present and download only when it is missing. The
-      // branches used to be swapped, which passed null to train and ignored --data-path.
-      val modelPath = if (lesk.dataPath == null) getTrainingData else lesk.dataPath
-
-      val (executor, _) = train(batchSize, numEpoch, ctx, modelPath)
-      executor.dispose()
-
-    } catch {
-      case ex: Exception => {
-        logger.error(ex.getMessage, ex)
-        parser.printUsage(System.err)
-        sys.exit(1)
-      }
-    }
-  }
-}
-
-// Command-line options for the multi-task example. The companion object hands an
-// instance to args4j's CmdLineParser and then reads the fields below.
-class ExampleMultiTask {
-  // Location of the MNIST data; null when the option is not given.
-  @Option(name = "--data-path", usage = "the mnist data path")
-  private val dataPath: String = null
-  // GPU index to train on; -1 selects the CPU context.
-  @Option(name = "--gpu", usage = "which gpu card to use, default is -1, means using cpu")
-  private val gpu: Int = -1
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/BucketIo.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/BucketIo.scala
deleted file mode 100644
index 2648f9e3d6bb..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/BucketIo.scala
+++ /dev/null
@@ -1,271 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-package org.apache.mxnetexamples.rnn
-
-import org.apache.mxnet.DType.DType
-import org.apache.mxnet._
-import org.slf4j.LoggerFactory
-
-import scala.collection.immutable.ListMap
-import scala.collection.mutable.ArrayBuffer
-import scala.io.Source
-import scala.util.Random
-import scala.collection.mutable
-
-object BucketIo {
-
-  type Text2Id = (String, Map[String, Int]) => Array[Int]
-  type ReadContent = String => String
-
-  def defaultReadContent(path: String): String = {
-    Source.fromFile(path, "UTF-8").mkString.replaceAll("\\. |\n", " <eos> ")
-  }
-
-  def defaultBuildVocab(path: String): Map[String, Int] = {
-    val content = defaultReadContent(path).split(" ")
-    var idx = 1 // 0 is left for zero - padding
-    val vocab = mutable.Map.empty[String, Int]
-    vocab.put(" ", 0) // put a dummy element here so that len (vocab) is correct
-    content.foreach(word =>
-      if (word.length > 0 && !vocab.contains(word)) {
-        vocab.put(word, idx)
-        idx += 1
-      }
-    )
-    vocab.toMap
-  }
-
-  def defaultText2Id(sentence: String, theVocab: Map[String, Int]): Array[Int] = {
-    val words = {
-      val tmp = sentence.split(" ").filter(_.length() > 0)
-      for (w <- tmp) yield theVocab(w)
-    }
-    words
-  }
-
-  def defaultGenBuckets(sentences: Array[String], batchSize: Int,
-                        theVocab: Map[String, Int]): IndexedSeq[Int] = {
-    val lenDict = scala.collection.mutable.Map[Int, Int]()
-    var maxLen = -1
-    for (sentence <- sentences) {
-      val wordsLen = defaultText2Id(sentence, theVocab).length
-      if (wordsLen > 0) {
-        if (wordsLen > maxLen) {
-          maxLen = wordsLen
-        }
-        if (lenDict.contains(wordsLen)) {
-          lenDict(wordsLen) = lenDict(wordsLen) + 1
-        } else {
-          lenDict += wordsLen -> 1
-        }
-      }
-    }
-
-    var tl = 0
-    val buckets = ArrayBuffer[Int]()
-    lenDict.foreach {
-      case (l, n) =>
-        if (n + tl >= batchSize) {
-          buckets.append(l)
-          tl = 0
-        } else tl += n
-    }
-    if (tl  > 0) buckets.append(maxLen)
-    buckets
-  }
-
-  class BucketSentenceIter(
-      path: String,
-      vocab: Map[String, Int],
-      var buckets: IndexedSeq[Int],
-      _batchSize: Int,
-      private val initStates: IndexedSeq[(String, (Int, Int))],
-      seperateChar: String = " <eos> ",
-      text2Id: Text2Id = defaultText2Id,
-      readContent: ReadContent = defaultReadContent) extends DataIter {
-    private val logger = LoggerFactory.getLogger(classOf[BucketSentenceIter])
-
-    private val content = readContent(path)
-    private val sentences = content.split(seperateChar)
-
-    if (buckets.length == 0) {
-      buckets = defaultGenBuckets(sentences, batchSize, vocab)
-    }
-    buckets = buckets.sorted
-    // pre-allocate with the largest bucket for better memory sharing
-    private val _defaultBucketKey = (buckets(0) /: buckets.drop(1)) { (max, elem) =>
-      if (max < elem) elem else max
-    }
-    override def defaultBucketKey: AnyRef = _defaultBucketKey.asInstanceOf[AnyRef]
-    // we just ignore the sentence it is longer than the maximum
-    // bucket size here
-    private val data = buckets.indices.map(x => Array[Array[Float]]()).toArray
-    for (sentence <- sentences) {
-      val ids = text2Id(sentence, vocab)
-      if (ids.length > 0) {
-        import scala.util.control.Breaks._
-        breakable { buckets.indices.foreach { idx =>
-          if (buckets(idx) >= ids.length) {
-            data(idx) = data(idx) :+
-            (ids.map(_.toFloat) ++ Array.fill[Float](buckets(idx) - ids.length)(0f))
-            break()
-          }
-        }}
-      }
-    }
-
-    // Get the size of each bucket, so that we could sample
-    // uniformly from the bucket
-    private val bucketSizes = data.map(_.length)
-    logger.info("Summary of dataset ==================")
-    buckets.zip(bucketSizes).foreach {
-      case (bkt, size) => logger.info(s"bucket of len $bkt : $size samples")
-    }
-
-     // make a random data iteration plan
-     // truncate each bucket into multiple of batch-size
-    private var bucketNBatches = Array[Int]()
-    for (i <- data.indices) {
-      bucketNBatches = bucketNBatches :+ (data(i).length / _batchSize)
-      data(i) = data(i).take(bucketNBatches(i) * _batchSize)
-    }
-
-    private val bucketPlan = {
-      val plan = bucketNBatches.zipWithIndex.map(x => Array.fill[Int](x._1)(x._2)).flatten
-      Random.shuffle(plan.toList).toArray
-    }
-
-    private val bucketIdxAll = data.map(_.length).map(l =>
-      Random.shuffle((0 until l).toList).toArray)
-    private val bucketCurrIdx = data.map(x => 0)
-
-    private val dataBuffer = ArrayBuffer[NDArray]()
-    private val labelBuffer = ArrayBuffer[NDArray]()
-    for (iBucket <- data.indices) {
-      dataBuffer.append(NDArray.zeros(_batchSize, buckets(iBucket)))
-      labelBuffer.append(NDArray.zeros(_batchSize, buckets(iBucket)))
-    }
-
-    private val _provideData = { val tmp = ListMap("data" -> Shape(_batchSize, _defaultBucketKey))
-      tmp ++ initStates.map(x => x._1 -> Shape(x._2._1, x._2._2))
-    }
-
-    private val _provideLabel = ListMap("softmax_label" -> Shape(_batchSize, _defaultBucketKey))
-
-    private val _provideDataDesc = {
-      // TODO: need to allow user to specify DType and Layout
-      val tmp = IndexedSeq(new DataDesc("data",
-        Shape(_batchSize, _defaultBucketKey), DType.Float32, Layout.UNDEFINED))
-      tmp ++ initStates.map(x => new DataDesc(x._1, Shape(x._2._1, x._2._2),
-        DType.Float32, Layout.UNDEFINED))
-    }
-
-    private val _provideLabelDesc = IndexedSeq(
-      // TODO: need to allow user to specify DType and Layout
-      new DataDesc("softmax_label",
-      Shape(_batchSize, _defaultBucketKey), DType.Float32, Layout.UNDEFINED))
-
-    private var iBucket = 0
-
-    override def next(): DataBatch = {
-      if (!hasNext) throw new NoSuchElementException
-      val bucketIdx = bucketPlan(iBucket)
-      val dataBuf = dataBuffer(bucketIdx)
-      val iIdx = bucketCurrIdx(bucketIdx)
-      val idx = bucketIdxAll(bucketIdx).slice(iIdx, iIdx + _batchSize)
-      bucketCurrIdx(bucketIdx) = bucketCurrIdx(bucketIdx) + _batchSize
-
-      val datas = idx.map(i => data(bucketIdx)(i))
-      for (sentence <- datas) {
-        require(sentence.length == buckets(bucketIdx))
-      }
-      dataBuf.set(datas.flatten)
-
-      val labelBuf = labelBuffer(bucketIdx)
-      val labels = idx.map(i => data(bucketIdx)(i).drop(1) :+ 0f)
-      labelBuf.set(labels.flatten)
-
-      iBucket += 1
-      val batchProvideData = IndexedSeq(DataDesc("data", dataBuf.shape, dataBuf.dtype)) ++
-        initStates.map {
-          case (name, shape) => DataDesc(name, Shape(shape._1, shape._2), DType.Float32)}
-      val batchProvideLabel = IndexedSeq(DataDesc("softmax_label", labelBuf.shape, labelBuf.dtype))
-      val initStateArrays = initStates.map(x => NDArray.zeros(x._2._1, x._2._2))
-      new DataBatch(IndexedSeq(dataBuf.copy()) ++ initStateArrays,
-        IndexedSeq(labelBuf.copy()),
-        getIndex(),
-        getPad(),
-        this.buckets(bucketIdx).asInstanceOf[AnyRef],
-        batchProvideData, batchProvideLabel)
-    }
-
-    /**
-     * reset the iterator
-     */
-    override def reset(): Unit = {
-      iBucket = 0
-      bucketCurrIdx.indices.foreach(i => bucketCurrIdx(i) = 0)
-    }
-
-    override def batchSize: Int = _batchSize
-
-    /**
-     * get data of current batch
-     * @return the data of current batch
-     */
-    override def getData(): IndexedSeq[NDArray] = IndexedSeq(dataBuffer(bucketPlan(iBucket)))
-
-    /**
-     * Get label of current batch
-     * @return the label of current batch
-     */
-    override def getLabel(): IndexedSeq[NDArray] = IndexedSeq(labelBuffer(bucketPlan(iBucket)))
-
-    /**
-     * the index of current batch
-     * @return
-     */
-    override def getIndex(): IndexedSeq[Long] = IndexedSeq[Long]()
-
-    /**
-      * get the number of padding examples
-      * in current batch
-      * @return number of padding examples in current batch
-      */
-    override def getPad(): Int = 0
-
-    // The name and shape of label provided by this iterator
-    @deprecated("Use provideLabelDesc instead", "1.3.0")
-    override def provideLabel: ListMap[String, Shape] = this._provideLabel
-
-    // The name and shape of data provided by this iterator
-    @deprecated("Use provideDataDesc instead", "1.3.0")
-    override def provideData: ListMap[String, Shape] = this._provideData
-
-    // Provide type:DataDesc of the data
-    override def provideDataDesc: IndexedSeq[DataDesc] = _provideDataDesc
-
-    // Provide type:DataDesc of the label
-    override def provideLabelDesc: IndexedSeq[DataDesc] = _provideLabelDesc
-
-    override def hasNext: Boolean = {
-      iBucket < bucketPlan.length
-    }
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/Lstm.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/Lstm.scala
deleted file mode 100644
index 872ef7871fb0..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/Lstm.scala
+++ /dev/null
@@ -1,163 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-package org.apache.mxnetexamples.rnn
-
-import org.apache.mxnet.{Shape, Symbol}
-
-import scala.collection.mutable.ArrayBuffer
-
-object Lstm {
-
-  final case class LSTMState(c: Symbol, h: Symbol)
-  final case class LSTMParam(i2hWeight: Symbol, i2hBias: Symbol,
-                             h2hWeight: Symbol, h2hBias: Symbol)
-
-  // LSTM Cell symbol
-  def lstm(numHidden: Int, inData: Symbol, prevState: LSTMState,
-           param: LSTMParam, seqIdx: Int, layerIdx: Int, dropout: Float = 0f): LSTMState = {
-    val inDataa = {
-      if (dropout > 0f) Symbol.api.Dropout(data = Some(inData), p = Some(dropout))
-      else inData
-    }
-    val i2h = Symbol.api.FullyConnected(data = Some(inDataa), weight = Some(param.i2hWeight),
-      bias = Some(param.i2hBias), num_hidden = numHidden * 4, name = s"t${seqIdx}_l${layerIdx}_i2h")
-    val h2h = Symbol.api.FullyConnected(data = Some(prevState.h), weight = Some(param.h2hWeight),
-      bias = Some(param.h2hBias), num_hidden = numHidden * 4, name = s"t${seqIdx}_l${layerIdx}_h2h")
-    val gates = i2h + h2h
-    val sliceGates = Symbol.api.SliceChannel(data = Some(gates), num_outputs = 4,
-      name = s"t${seqIdx}_l${layerIdx}_slice")
-    val ingate = Symbol.api.Activation(data = Some(sliceGates.get(0)), act_type = "sigmoid")
-    val inTransform = Symbol.api.Activation(data = Some(sliceGates.get(1)), act_type = "tanh")
-    val forgetGate = Symbol.api.Activation(data = Some(sliceGates.get(2)), act_type = "sigmoid")
-    val outGate = Symbol.api.Activation(data = Some(sliceGates.get(3)), act_type = "sigmoid")
-    val nextC = (forgetGate * prevState.c) + (ingate * inTransform)
-    val nextH = outGate * Symbol.api.Activation(data = Some(nextC), "tanh")
-    LSTMState(c = nextC, h = nextH)
-  }
-
-  // we define a new unrolling function here because the original
-  // one in lstm.py concats all the labels at the last layer together,
-  // making the mini-batch size of the label different from the data.
-  // I think the existing data-parallelization code need some modification
-  // to allow this situation to work properly
-  def lstmUnroll(numLstmLayer: Int, seqLen: Int, inputSize: Int, numHidden: Int,
-                 numEmbed: Int, numLabel: Int, dropout: Float = 0f): Symbol = {
-    val embedWeight = Symbol.Variable("embed_weight")
-    val clsWeight = Symbol.Variable("cls_weight")
-    val clsBias = Symbol.Variable("cls_bias")
-
-    val paramCellsBuf = ArrayBuffer[LSTMParam]()
-    val lastStatesBuf = ArrayBuffer[LSTMState]()
-    for (i <- 0 until numLstmLayer) {
-      paramCellsBuf.append(LSTMParam(i2hWeight = Symbol.Variable(s"l${i}_i2h_weight"),
-        i2hBias = Symbol.Variable(s"l${i}_i2h_bias"),
-        h2hWeight = Symbol.Variable(s"l${i}_h2h_weight"),
-        h2hBias = Symbol.Variable(s"l${i}_h2h_bias")))
-      lastStatesBuf.append(LSTMState(c = Symbol.Variable(s"l${i}_init_c_beta"),
-        h = Symbol.Variable(s"l${i}_init_h_beta")))
-    }
-    val paramCells = paramCellsBuf.toArray
-    val lastStates = lastStatesBuf.toArray
-    require(lastStates.length == numLstmLayer)
-
-    // embeding layer
-    val data = Symbol.Variable("data")
-    var label = Symbol.Variable("softmax_label")
-    val embed = Symbol.api.Embedding(data = Some(data), input_dim = inputSize,
-      weight = Some(embedWeight), output_dim = numEmbed, name = "embed")
-    val wordvec = Symbol.api.SliceChannel(data = Some(embed),
-      num_outputs = seqLen, squeeze_axis = Some(true))
-
-    val hiddenAll = ArrayBuffer[Symbol]()
-    var dpRatio = 0f
-    var hidden: Symbol = null
-    for (seqIdx <- 0 until seqLen) {
-      hidden = wordvec.get(seqIdx)
-      // stack LSTM
-      for (i <- 0 until numLstmLayer) {
-        if (i == 0) dpRatio = 0f else dpRatio = dropout
-        val nextState = lstm(numHidden, inData = hidden,
-          prevState = lastStates(i),
-          param = paramCells(i),
-          seqIdx = seqIdx, layerIdx = i, dropout = dpRatio)
-        hidden = nextState.h
-        lastStates(i) = nextState
-      }
-      // decoder
-      if (dropout > 0f) hidden = Symbol.api.Dropout(data = Some(hidden), p = Some(dropout))
-      hiddenAll.append(hidden)
-    }
-    val hiddenConcat = Symbol.api.Concat(data = hiddenAll.toArray, num_args = hiddenAll.length,
-      dim = Some(0))
-    val pred = Symbol.api.FullyConnected(data = Some(hiddenConcat), num_hidden = numLabel,
-      weight = Some(clsWeight), bias = Some(clsBias))
-    label = Symbol.api.transpose(data = Some(label))
-    label = Symbol.api.Reshape(data = Some(label), target_shape = Some(Shape(0)))
-    val sm = Symbol.api.SoftmaxOutput(data = Some(pred), label = Some(label), name = "softmax")
-    sm
-  }
-
-  def lstmInferenceSymbol(numLstmLayer: Int, inputSize: Int, numHidden: Int,
-                          numEmbed: Int, numLabel: Int, dropout: Float = 0f): Symbol = {
-    val seqIdx = 0
-    val embedWeight = Symbol.Variable("embed_weight")
-    val clsWeight = Symbol.Variable("cls_weight")
-    val clsBias = Symbol.Variable("cls_bias")
-
-    var paramCells = Array[LSTMParam]()
-    var lastStates = Array[LSTMState]()
-    for (i <- 0 until numLstmLayer) {
-      paramCells = paramCells :+ LSTMParam(i2hWeight = Symbol.Variable(s"l${i}_i2h_weight"),
-        i2hBias = Symbol.Variable(s"l${i}_i2h_bias"),
-        h2hWeight = Symbol.Variable(s"l${i}_h2h_weight"),
-        h2hBias = Symbol.Variable(s"l${i}_h2h_bias"))
-      lastStates = lastStates :+ LSTMState(c = Symbol.Variable(s"l${i}_init_c_beta"),
-        h = Symbol.Variable(s"l${i}_init_h_beta"))
-    }
-    assert(lastStates.length == numLstmLayer)
-
-    val data = Symbol.Variable("data")
-
-    var hidden = Symbol.api.Embedding(data = Some(data), input_dim = inputSize,
-      weight = Some(embedWeight), output_dim = numEmbed, name = "embed")
-
-    var dpRatio = 0f
-    // stack LSTM
-    for (i <- 0 until numLstmLayer) {
-      if (i == 0) dpRatio = 0f else dpRatio = dropout
-      val nextState = lstm(numHidden, inData = hidden,
-        prevState = lastStates(i),
-        param = paramCells(i),
-        seqIdx = seqIdx, layerIdx = i, dropout = dpRatio)
-      hidden = nextState.h
-      lastStates(i) = nextState
-    }
-    // decoder
-    if (dropout > 0f) hidden = Symbol.api.Dropout(data = Some(hidden), p = Some(dropout))
-    val fc = Symbol.api.FullyConnected(data = Some(hidden),
-      num_hidden = numLabel, weight = Some(clsWeight), bias = Some(clsBias))
-    val sm = Symbol.api.SoftmaxOutput(data = Some(fc), name = "softmax")
-    var output = Array(sm)
-    for (state <- lastStates) {
-      output = output :+ state.c
-      output = output :+ state.h
-    }
-    Symbol.Group(output: _*)
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/LstmBucketing.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/LstmBucketing.scala
deleted file mode 100644
index d1a70a755b01..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/LstmBucketing.scala
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-package org.apache.mxnetexamples.rnn
-
-import org.apache.mxnet.Callback.Speedometer
-import org.apache.mxnet._
-import BucketIo.BucketSentenceIter
-import org.apache.mxnet.optimizer.SGD
-import org.kohsuke.args4j.{CmdLineParser, Option}
-import org.slf4j.{Logger, LoggerFactory}
-
-import scala.collection.JavaConverters._
-import org.apache.mxnet.module.BucketingModule
-import org.apache.mxnet.module.FitParams
-
-/**
-  * Bucketing LSTM examples
-  */
-class LstmBucketing {
-  @Option(name = "--data-train", usage = "training set")
-  private val dataTrain: String = "example/rnn/sherlockholmes.train.txt"
-  @Option(name = "--data-val", usage = "validation set")
-  private val dataVal: String = "example/rnn/sherlockholmes.valid.txt"
-  @Option(name = "--num-epoch", usage = "the number of training epoch")
-  private val numEpoch: Int = 5
-  @Option(name = "--gpus", usage = "the gpus will be used, e.g. '0,1,2,3'")
-  private val gpus: String = null
-  @Option(name = "--cpus", usage = "the cpus will be used, e.g. '0,1,2,3'")
-  private val cpus: String = null
-  @Option(name = "--save-model-path", usage = "the model saving path")
-  private val saveModelPath: String = "model/lstm"
-}
-
-object LstmBucketing {
-  private val logger: Logger = LoggerFactory.getLogger(classOf[LstmBucketing])
-
-  def perplexity(label: NDArray, pred: NDArray): Float = {
-    pred.waitToRead()
-    val labelArr = label.T.toArray.map(_.toInt)
-    var loss = .0
-    (0 until pred.shape(0)).foreach(i =>
-      loss -= Math.log(Math.max(1e-10f, pred.slice(i).toArray(labelArr(i))))
-    )
-    Math.exp(loss / labelArr.length).toFloat
-  }
-
-  def runTraining(trainData : String, validationData : String,
-                  ctx : Array[Context], numEpoch : Int): Unit = {
-    ResourceScope.using() {
-      val batchSize = 32
-      val buckets = Array(10, 20, 30, 40, 50, 60)
-      val numHidden = 200
-      val numEmbed = 200
-      val numLstmLayer = 2
-
-      logger.info("Building vocab ...")
-      val vocab = BucketIo.defaultBuildVocab(trainData)
-
-      def BucketSymGen(key: AnyRef):
-      (Symbol, IndexedSeq[String], IndexedSeq[String]) = {
-        val seqLen = key.asInstanceOf[Int]
-        val sym = Lstm.lstmUnroll(numLstmLayer, seqLen, vocab.size,
-          numHidden = numHidden, numEmbed = numEmbed, numLabel = vocab.size)
-        (sym, IndexedSeq("data"), IndexedSeq("softmax_label"))
-      }
-
-      val initC = (0 until numLstmLayer).map(l =>
-        (s"l${l}_init_c_beta", (batchSize, numHidden))
-      )
-      val initH = (0 until numLstmLayer).map(l =>
-        (s"l${l}_init_h_beta", (batchSize, numHidden))
-      )
-      val initStates = initC ++ initH
-
-      val dataTrain = new BucketSentenceIter(trainData, vocab,
-        buckets, batchSize, initStates)
-      val dataVal = new BucketSentenceIter(validationData, vocab,
-        buckets, batchSize, initStates)
-
-      val model = new BucketingModule(
-        symGen = BucketSymGen,
-        defaultBucketKey = dataTrain.defaultBucketKey,
-        contexts = ctx)
-
-      val fitParams = new FitParams()
-      fitParams.setEvalMetric(
-        new CustomMetric(perplexity, name = "perplexity"))
-      fitParams.setKVStore("device")
-      fitParams.setOptimizer(
-        new SGD(learningRate = 0.01f, momentum = 0f, wd = 0.00001f))
-      fitParams.setInitializer(new Xavier(factorType = "in", magnitude = 2.34f))
-      fitParams.setBatchEndCallback(new Speedometer(batchSize, 50))
-
-      logger.info("Start training ...")
-      model.fit(
-        trainData = dataTrain,
-        evalData = Some(dataVal),
-        numEpoch = numEpoch, fitParams)
-      logger.info("Finished training...")
-    }
-  }
-
-  def main(args: Array[String]): Unit = {
-    val inst = new LstmBucketing
-    val parser: CmdLineParser = new CmdLineParser(inst)
-    try {
-      parser.parseArgument(args.toList.asJava)
-      val contexts =
-        if (inst.gpus != null) inst.gpus.split(',').map(id => Context.gpu(id.trim.toInt))
-        else if (inst.cpus != null) inst.cpus.split(',').map(id => Context.cpu(id.trim.toInt))
-        else Array(Context.cpu(0))
-
-      runTraining(inst.dataTrain, inst.dataVal, contexts, 5)
-    } catch {
-      case ex: Exception =>
-        logger.error(ex.getMessage, ex)
-        parser.printUsage(System.err)
-        sys.exit(1)
-    }
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/README.md b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/README.md
deleted file mode 100644
index 2ef2f47c7983..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/README.md
+++ /dev/null
@@ -1,65 +0,0 @@
-<!--- Licensed to the Apache Software Foundation (ASF) under one -->
-<!--- or more contributor license agreements.  See the NOTICE file -->
-<!--- distributed with this work for additional information -->
-<!--- regarding copyright ownership.  The ASF licenses this file -->
-<!--- to you under the Apache License, Version 2.0 (the -->
-<!--- "License"); you may not use this file except in compliance -->
-<!--- with the License.  You may obtain a copy of the License at -->
-
-<!---   http://www.apache.org/licenses/LICENSE-2.0 -->
-
-<!--- Unless required by applicable law or agreed to in writing, -->
-<!--- software distributed under the License is distributed on an -->
-<!--- "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -->
-<!--- KIND, either express or implied.  See the License for the -->
-<!--- specific language governing permissions and limitations -->
-<!--- under the License. -->
-
-# RNN Example for MXNet Scala
-This folder contains the following examples writing in new Scala type-safe API:
-- [x] LSTM Bucketing
-- [x] CharRNN Inference : Generate similar text based on the model
-- [x] CharRNN Training: Training the language model using RNN
-
-These example is only for Illustration and not modeled to achieve the best accuracy.
-
-## Setup
-### Download the Network Definition, Weights and Training Data
-`obama.zip` contains the training inputs (Obama's speech) for CharCNN examples and `sherlockholmes` contains the data for LSTM Bucketing
-```bash
-https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/RNN/obama.zip
-https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/RNN/sherlockholmes.train.txt
-https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/RNN/sherlockholmes.valid.txt
-```
-### Unzip the file
-```bash
-unzip obama.zip
-```
-### Arguement Configuration
-Then you need to define the arguments that you would like to pass in the model:
-
-#### LSTM Bucketing
-```bash
---data-train
-<path>/sherlockholmes.train.txt
---data-val
-<path>/sherlockholmes.valid.txt
---cpus
-<num_cpus>
---gpus
-<num_gpu>
-```
-#### TrainCharRnn
-```bash
---data-path
-<path>/obama.txt
---save-model-path
-<path>/
-```
-#### TestCharRnn
-```bash
---data-path
-<path>/obama.txt
---model-prefix
-<path>/obama
-```
\ No newline at end of file
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/RnnModel.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/RnnModel.scala
deleted file mode 100644
index 3cd79f46dcc1..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/RnnModel.scala
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-
-package org.apache.mxnetexamples.rnn
-
-import org.apache.mxnet.Context
-import org.apache.mxnet.NDArray
-import org.apache.mxnet.Shape
-import org.apache.mxnet.Symbol
-
-object RnnModel {
-  class LSTMInferenceModel(numLstmLayer: Int, inputSize: Int, numHidden: Int,
-                           numEmbed: Int, numLabel: Int, argParams: Map[String, NDArray],
-                           ctx: Context = Context.cpu(), dropout: Float = 0f) {
-    private val sym = Lstm.lstmInferenceSymbol(numLstmLayer,
-                                               inputSize,
-                                               numHidden,
-                                               numEmbed,
-                                               numLabel,
-                                               dropout)
-    private val batchSize = 1
-    private val initC = (for (l <- 0 until numLstmLayer)
-                          yield (s"l${l}_init_c_beta" -> Shape(batchSize, numHidden))).toMap
-    private val initH = (for (l <- 0 until numLstmLayer)
-                          yield (s"l${l}_init_h_beta" -> Shape(batchSize, numHidden))).toMap
-    private val dataShape = Map("data" -> Shape(batchSize))
-    private val inputShape = initC ++ initH ++ dataShape
-    private val executor = sym.simpleBind(ctx = ctx, shapeDict = inputShape)
-
-    for (key <- this.executor.argDict.keys) {
-      if (!inputShape.contains(key) && argParams.contains(key) && key != "softmax_label") {
-        argParams(key).copyTo(this.executor.argDict(key))
-      }
-    }
-
-    private var stateName = (Array[String]() /: (0 until numLstmLayer)) { (acc, i) =>
-      acc :+ s"l${i}_init_c_beta"  :+ s"l${i}_init_h_beta"
-    }
-
-    private val statesDict = stateName.zip(this.executor.outputs.drop(1)).toMap
-    private val inputArr = NDArray.zeros(dataShape("data"))
-
-    def forward(inputData: NDArray, newSeq: Boolean = false): Array[Float] = {
-      if (newSeq == true) {
-        for (key <- this.statesDict.keys) {
-          this.executor.argDict(key).set(0f)
-        }
-      }
-      inputData.copyTo(this.executor.argDict("data"))
-      this.executor.forward()
-      for (key <- this.statesDict.keys) {
-        this.statesDict(key).copyTo(this.executor.argDict(key))
-      }
-      val prob = this.executor.outputs(0).toArray
-      prob
-    }
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/TestCharRnn.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/TestCharRnn.scala
deleted file mode 100644
index 750fd9837e53..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/TestCharRnn.scala
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.rnn
-
-import org.apache.mxnet._
-import org.apache.mxnetexamples.InferBase
-import org.apache.mxnetexamples.benchmark.CLIParserBase
-import org.kohsuke.args4j.{CmdLineParser, Option}
-import org.slf4j.LoggerFactory
-
-import scala.collection.JavaConverters._
-
-/**
- * Follows the demo, to test the char rnn:
- * https://github.com/dmlc/mxnet/blob/master/example/rnn/char-rnn.ipynb
- */
-object TestCharRnn {
-
-  private val logger = LoggerFactory.getLogger(classOf[TrainCharRnn])
-
-  def runInferenceCharRNN(dataPath: String, modelPrefix: String, starterSentence : String): Unit = {
-    ResourceScope.using() {
-      // The batch size for training
-      val batchSize = 32
-      // We can support various length input
-      // For this problem, we cut each input sentence to length of 129
-      // So we only need fix length bucket
-      val buckets = List(129)
-      // hidden unit in LSTM cell
-      val numHidden = 512
-      // embedding dimension, which is, map a char to a 256 dim vector
-      val numEmbed = 256
-      // number of lstm layer
-      val numLstmLayer = 3
-
-      // build char vocabluary from input
-      val vocab = Utils.buildVocab(dataPath)
-
-      // load from check-point
-      val (_, argParams, _) = Model.loadCheckpoint(modelPrefix, 75)
-
-      // build an inference model
-      val model = new RnnModel.LSTMInferenceModel(numLstmLayer, vocab.size + 1,
-        numHidden = numHidden, numEmbed = numEmbed,
-        numLabel = vocab.size + 1, argParams = argParams, dropout = 0.2f)
-
-      // generate a sequence of 1200 chars
-      val seqLength = 1200
-      val inputNdarray = NDArray.zeros(1)
-      val revertVocab = Utils.makeRevertVocab(vocab)
-
-      // Feel free to change the starter sentence
-      var output = starterSentence
-      val randomSample = true
-      var newSentence = true
-      val ignoreLength = output.length()
-
-      for (i <- 0 until seqLength) {
-        if (i <= ignoreLength - 1) Utils.makeInput(output(i), vocab, inputNdarray)
-        else Utils.makeInput(output.takeRight(1)(0), vocab, inputNdarray)
-        val prob = model.forward(inputNdarray, newSentence)
-        newSentence = false
-        val nextChar = Utils.makeOutput(prob, revertVocab, randomSample)
-        if (nextChar == "") newSentence = true
-        if (i >= ignoreLength) output = output ++ nextChar
-      }
-
-      // Let's see what we can learned from char in Obama's speech.
-      logger.info(output)
-    }
-  }
-
-  def main(args: Array[String]): Unit = {
-    val stcr = new CLIParser
-    val parser: CmdLineParser = new CmdLineParser(stcr)
-    try {
-      parser.parseArgument(args.toList.asJava)
-      assert(stcr.dataPath != null && stcr.modelPrefix != null && stcr.starterSentence != null)
-      runInferenceCharRNN(stcr.dataPath, stcr.modelPrefix, stcr.starterSentence)
-    } catch {
-      case ex: Exception => {
-        logger.error(ex.getMessage, ex)
-        parser.printUsage(System.err)
-        sys.exit(1)
-      }
-    }
-  }
-}
-
-class CLIParser extends CLIParserBase {
-  @Option(name = "--data-path", usage = "the input train data file")
-  val dataPath: String = "./data/obama.txt"
-  @Option(name = "--model-prefix", usage = "the model prefix")
-  val modelPrefix: String = "./model/obama"
-  @Option(name = "--starter-sentence", usage = "the starter sentence")
-  val starterSentence: String = "The joke"
-}
-
-class TestCharRnn(CLIParser: CLIParser) extends InferBase {
-
-  private var vocab : Map[String, Int] = null
-
-  override def loadModel(context: Array[Context], batchInference : Boolean = false): Any = {
-    val batchSize = 32
-    val buckets = List(129)
-    val numHidden = 512
-    val numEmbed = 256
-    val numLstmLayer = 3
-    val (_, argParams, _) = Model.loadCheckpoint(CLIParser.modelPrefix, 75)
-    this.vocab = Utils.buildVocab(CLIParser.dataPath)
-    var ctx = Context.cpu()
-    if (System.getenv().containsKey("SCALA_TEST_ON_GPU") &&
-      System.getenv("SCALA_TEST_ON_GPU").toInt == 1) {
-      ctx = Context.gpu()
-    }
-    val model = new RnnModel.LSTMInferenceModel(numLstmLayer, vocab.size + 1,
-      numHidden = numHidden, numEmbed = numEmbed,
-      numLabel = vocab.size + 1, argParams = argParams, dropout = 0.2f, ctx = ctx)
-    model
-  }
-
-  override def loadSingleData(): Any = {
-    val revertVocab = Utils.makeRevertVocab(vocab)
-    revertVocab
-  }
-
-  override def runSingleInference(loadedModel: Any, input: Any): Any = {
-    val model = loadedModel.asInstanceOf[RnnModel.LSTMInferenceModel]
-    val revertVocab = input.asInstanceOf[Map[Int, String]]
-    // generate a sequence of 1200 chars
-    val seqLength = 1200
-    val inputNdarray = NDArray.zeros(1)
-    // Feel free to change the starter sentence
-    var output = CLIParser.starterSentence
-    val randomSample = true
-    var newSentence = true
-    val ignoreLength = output.length()
-
-    for (i <- 0 until seqLength) {
-      if (i <= ignoreLength - 1) Utils.makeInput(output(i), vocab, inputNdarray)
-      else Utils.makeInput(output.takeRight(1)(0), vocab, inputNdarray)
-      val prob = model.forward(inputNdarray, newSentence)
-      newSentence = false
-      val nextChar = Utils.makeOutput(prob, revertVocab, randomSample)
-      if (nextChar == "") newSentence = true
-      if (i >= ignoreLength) output = output ++ nextChar
-    }
-    output
-  }
-
-  override def loadBatchFileList(batchSize: Int): List[Any] = null
-  override def loadInputBatch(source: Any): Any = null
-  override def runBatchInference(loadedModel: Any, input: Any): Any = null
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/TrainCharRnn.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/TrainCharRnn.scala
deleted file mode 100644
index 2704715b0c4d..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/TrainCharRnn.scala
+++ /dev/null
@@ -1,185 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.rnn
-
-import org.apache.mxnet._
-import org.kohsuke.args4j.{CmdLineParser, Option}
-import org.slf4j.LoggerFactory
-import scala.collection.JavaConverters._
-import org.apache.mxnet.optimizer.Adam
-
-/**
-  * Follows the demo, to train the char rnn:
-  * https://github.com/dmlc/mxnet/blob/master/example/rnn/char-rnn.ipynb
-  */
-object TrainCharRnn {
-
-  private val logger = LoggerFactory.getLogger(classOf[TrainCharRnn])
-
-  def runTrainCharRnn(dataPath: String, saveModelPath: String,
-                      ctx : Context, numEpoch : Int): Unit = {
-    ResourceScope.using() {
-      // The batch size for training
-      val batchSize = 32
-      // We can support various length input
-      // For this problem, we cut each input sentence to length of 129
-      // So we only need fix length bucket
-      val buckets = Array(129)
-      // hidden unit in LSTM cell
-      val numHidden = 512
-      // embedding dimension, which is, map a char to a 256 dim vector
-      val numEmbed = 256
-      // number of lstm layer
-      val numLstmLayer = 3
-      // we will show a quick demo in 2 epoch
-      // learning rate
-      val learningRate = 0.001f
-      // we will use pure sgd without momentum
-      val momentum = 0.0f
-
-      val vocab = Utils.buildVocab(dataPath)
-
-      // generate symbol for a length
-      def symGen(seqLen: Int): Symbol = {
-        Lstm.lstmUnroll(numLstmLayer, seqLen, vocab.size + 1,
-          numHidden = numHidden, numEmbed = numEmbed,
-          numLabel = vocab.size + 1, dropout = 0.2f)
-      }
-
-      // initalize states for LSTM
-      val initC = for (l <- 0 until numLstmLayer)
-        yield (s"l${l}_init_c_beta", (batchSize, numHidden))
-      val initH = for (l <- 0 until numLstmLayer)
-        yield (s"l${l}_init_h_beta", (batchSize, numHidden))
-      val initStates = initC ++ initH
-
-      val dataTrain = new BucketIo.BucketSentenceIter(dataPath, vocab, buckets,
-        batchSize, initStates, seperateChar = "\n",
-        text2Id = Utils.text2Id, readContent = Utils.readContent)
-
-      // the network symbol
-      val symbol = symGen(buckets(0))
-
-      val datasAndLabels = dataTrain.provideDataDesc ++ dataTrain.provideLabelDesc
-      val (argShapes, outputShapes, auxShapes) = symbol.inferShape(datasAndLabels)
-
-      val initializer = new Xavier(factorType = "in", magnitude = 2.34f)
-
-      val argNames = symbol.listArguments()
-      val argDict = argNames.zip(argShapes.map(NDArray.zeros(_, ctx))).toMap
-      val auxNames = symbol.listAuxiliaryStates()
-      val auxDict = auxNames.zip(auxShapes.map(NDArray.zeros(_, ctx))).toMap
-
-      val datasAndLabelsNames = datasAndLabels.map(_.name)
-      val gradDict = argNames.zip(argShapes).filter { case (name, shape) =>
-        !datasAndLabelsNames.contains(name)
-      }.map(x => x._1 -> NDArray.empty(x._2, ctx)).toMap
-
-      argDict.foreach { case (name, ndArray) =>
-        if (!datasAndLabelsNames.contains(name)) {
-          initializer.initWeight(name, ndArray)
-        }
-      }
-
-      val data = argDict("data")
-      val label = argDict("softmax_label")
-
-      val executor = symbol.bind(ctx, argDict, gradDict)
-
-      val opt = new Adam(learningRate = learningRate, wd = 0.0001f)
-
-      val paramsGrads = gradDict.toList.zipWithIndex.map { case ((name, grad), idx) =>
-        (idx, name, grad, opt.createState(idx, argDict(name)))
-      }
-
-      val evalMetric = new CustomMetric(Utils.perplexity, "perplexity")
-      val batchEndCallback = new Callback.Speedometer(batchSize, 50)
-      val epochEndCallback = Utils.doCheckpoint(s"${saveModelPath}/obama")
-
-      for (epoch <- 0 until numEpoch) {
-        // Training phase
-        val tic = System.currentTimeMillis
-        evalMetric.reset()
-        var nBatch = 0
-        var epochDone = false
-        // Iterate over training data.
-        dataTrain.reset()
-        while (!epochDone) {
-          var doReset = true
-          while (doReset && dataTrain.hasNext) {
-            val dataBatch = dataTrain.next()
-
-            data.set(dataBatch.data(0))
-            label.set(dataBatch.label(0))
-            executor.forward(isTrain = true)
-            executor.backward()
-            paramsGrads.foreach { case (idx, name, grad, optimState) =>
-              opt.update(idx, argDict(name), grad, optimState)
-            }
-
-            // evaluate at end, so out_cpu_array can lazy copy
-            evalMetric.update(dataBatch.label, executor.outputs)
-
-            nBatch += 1
-            batchEndCallback.invoke(epoch, nBatch, evalMetric)
-          }
-          if (doReset) {
-            dataTrain.reset()
-          }
-          // this epoch is done
-          epochDone = true
-        }
-        val (name, value) = evalMetric.get
-        name.zip(value).foreach { case (n, v) =>
-          logger.info(s"Epoch[$epoch] Train-$n=$v")
-        }
-        val toc = System.currentTimeMillis
-        logger.info(s"Epoch[$epoch] Time cost=${toc - tic}")
-
-        epochEndCallback.invoke(epoch, symbol, argDict, auxDict)
-      }
-      executor.dispose()
-    }
-  }
-
-  def main(args: Array[String]): Unit = {
-    val incr = new TrainCharRnn
-    val parser: CmdLineParser = new CmdLineParser(incr)
-    try {
-      parser.parseArgument(args.toList.asJava)
-      val ctx = if (incr.gpu == -1) Context.cpu() else Context.gpu(incr.gpu)
-      assert(incr.dataPath != null && incr.saveModelPath != null)
-      runTrainCharRnn(incr.dataPath, incr.saveModelPath, ctx, 75)
-    } catch {
-      case ex: Exception => {
-        logger.error(ex.getMessage, ex)
-        parser.printUsage(System.err)
-        sys.exit(1)
-      }
-    }
-  }
-}
-
-class TrainCharRnn {
-  @Option(name = "--data-path", usage = "the input train data file")
-  private val dataPath: String = "./data/obama.txt"
-  @Option(name = "--save-model-path", usage = "the model saving path")
-  private val saveModelPath: String = "./model/"
-  @Option(name = "--gpu", usage = "which gpu card to use, default is -1, means using cpu")
-  private val gpu: Int = -1
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/Utils.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/Utils.scala
deleted file mode 100644
index 3f9a9842e0a9..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/rnn/Utils.scala
+++ /dev/null
@@ -1,151 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.rnn
-
-import scala.io.Source
-import org.apache.mxnet.EvalMetric
-import org.apache.mxnet.NDArray
-import org.apache.mxnet.EpochEndCallback
-import org.apache.mxnet.Model
-import org.apache.mxnet.Symbol
-import scala.util.Random
-
-object Utils {
-
-  def readContent(path: String): String = Source.fromFile(path).mkString
-
-  // Build  a vocabulary of what char we have in the content
-  def buildVocab(path: String): Map[String, Int] = {
-    val content = readContent(path).split("\n")
-    var idx = 1 // 0 is left for zero padding
-    var theVocab = Map[String, Int]()
-    for (line <- content) {
-      for (char <- line) {
-        val key = s"$char"
-        if (!theVocab.contains(key)) {
-          theVocab = theVocab + (key -> idx)
-          idx += 1
-        }
-      }
-    }
-    theVocab
-  }
-
-  // We will assign each char with a special numerical id
-  def text2Id(sentence: String, theVocab: Map[String, Int]): Array[Int] = {
-    val words = for (char <- sentence) yield theVocab(s"$char")
-    words.toArray
-  }
-
-  // Evaluation
-  def perplexity(label: NDArray, pred: NDArray): Float = {
-    val shape = label.shape
-    val size = shape(0) * shape(1)
-    val labelT = {
-      val tmp = label.toArray.grouped(shape(1)).toArray
-      val result = Array.fill[Float](size)(0f)
-      var idx = 0
-      for (i <- 0 until shape(1)) {
-        for (j <- 0 until shape(0)) {
-          result(idx) = tmp(j)(i)
-          idx += 1
-        }
-      }
-      result
-    }
-    var loss = 0f
-    val predArray = pred.toArray.grouped(pred.shape(1)).toArray
-    for (i <- 0 until pred.shape(0)) {
-      loss += -Math.log(Math.max(1e-10, predArray(i)(labelT(i).toInt)).toFloat).toFloat
-    }
-    loss / size
-  }
-
-  def doCheckpoint(prefix: String): EpochEndCallback = new EpochEndCallback {
-    override def invoke(epoch: Int, symbol: Symbol,
-                        argParams: Map[String, NDArray],
-                        auxStates: Map[String, NDArray]): Unit = {
-      Model.saveCheckpoint(prefix, epoch + 1, symbol, argParams, auxStates)
-    }
-  }
-
-  // helper strcuture for prediction
-  def makeRevertVocab(vocab: Map[String, Int]): Map[Int, String] = {
-    var dic = Map[Int, String]()
-    vocab.foreach { case (k, v) =>
-      dic = dic + (v -> k)
-    }
-    dic
-  }
-
-  // make input from char
-  def makeInput(char: Char, vocab: Map[String, Int], arr: NDArray): Unit = {
-    val idx = vocab(s"$char")
-    val tmp = NDArray.zeros(1)
-    tmp.set(idx)
-    arr.set(tmp)
-  }
-
-  // helper function for random sample
-  def cdf(weights: Array[Float]): Array[Float] = {
-    val total = weights.sum
-    var result = Array[Float]()
-    var cumsum = 0f
-    for (w <- weights) {
-      cumsum += w
-      result = result :+ (cumsum / total)
-    }
-    result
-  }
-
-  def choice(population: Array[String], weights: Array[Float]): String = {
-    assert(population.length == weights.length)
-    val cdfVals = cdf(weights)
-    val x = Random.nextFloat()
-    var idx = 0
-    var found = false
-    for (i <- 0 until cdfVals.length) {
-      if (cdfVals(i) >= x && !found) {
-        idx = i
-        found = true
-      }
-    }
-    population(idx)
-  }
-
-  // we can use random output or fixed output by choosing largest probability
-  def makeOutput(prob: Array[Float], vocab: Map[Int, String],
-                 sample: Boolean = false, temperature: Float = 1f): String = {
-    var idx = -1
-    val char = if (sample == false) {
-      idx = ((-1f, -1) /: prob.zipWithIndex) { (max, elem) =>
-        if (max._1 < elem._1) elem else max
-      }._2
-      if (vocab.contains(idx)) vocab(idx)
-      else ""
-    } else {
-      val fixDict = Array("") ++ (1 until vocab.size + 1).map(i => vocab(i))
-      var scaleProb = prob.map(x => if (x < 1e-6) 1e-6 else if (x > 1 - 1e-6) 1 - 1e-6 else x)
-      var rescale = scaleProb.map(x => Math.exp(Math.log(x) / temperature).toFloat)
-      val sum = rescale.sum.toFloat
-      rescale = rescale.map(_ / sum)
-      choice(fixDict, rescale)
-    }
-    char
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/AlexNet.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/AlexNet.scala
deleted file mode 100644
index 108439ddc192..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/AlexNet.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.visualization
-
-import org.apache.mxnet.Symbol
-
-/**
- * @author Depeng Liang
- */
-object AlexNet {
-
-  def getSymbol(numClasses: Int = 1000): Symbol = {
-    val inputData = Symbol.Variable("data")
-    // stage 1
-    val conv1 = Symbol.Convolution()()(Map(
-        "data" -> inputData, "kernel" -> "(11, 11)", "stride" -> "(4, 4)", "num_filter" -> 96))
-    val relu1 = Symbol.Activation()()(Map("data" -> conv1, "act_type" -> "relu"))
-    val pool1 = Symbol.Pooling()()(Map(
-        "data" -> relu1, "pool_type" -> "max", "kernel" -> "(3, 3)", "stride" -> "(2,2)"))
-    val lrn1 = Symbol.LRN()()(Map("data" -> pool1,
-        "alpha" -> 0.0001f, "beta" -> 0.75f, "knorm" -> 1f, "nsize" -> 5))
-    // stage 2
-    val conv2 = Symbol.Convolution()()(Map(
-        "data" -> lrn1, "kernel" -> "(5, 5)", "pad" -> "(2, 2)", "num_filter" -> 256))
-    val relu2 = Symbol.Activation()()(Map("data" -> conv2, "act_type" -> "relu"))
-    val pool2 = Symbol.Pooling()()(Map("data" -> relu2,
-        "kernel" -> "(3, 3)", "stride" -> "(2, 2)", "pool_type" -> "max"))
-    val lrn2 = Symbol.LRN()()(Map("data" -> pool2,
-        "alpha" -> 0.0001f, "beta" -> 0.75f, "knorm" -> 1f, "nsize" -> 5))
-    // stage 3
-    val conv3 = Symbol.Convolution()()(Map(
-        "data" -> lrn2, "kernel" -> "(3, 3)", "pad" -> "(1, 1)", "num_filter" -> 384))
-    val relu3 = Symbol.Activation()()(Map("data" -> conv3, "act_type" -> "relu"))
-    val conv4 = Symbol.Convolution()()(Map(
-        "data" -> relu3, "kernel" -> "(3, 3)", "pad" -> "(1, 1)", "num_filter" -> 384))
-    val relu4 = Symbol.Activation()()(Map("data" -> conv4, "act_type" -> "relu"))
-    val conv5 = Symbol.Convolution()()(Map(
-        "data" -> relu4, "kernel" -> "(3, 3)", "pad" -> "(1, 1)", "num_filter" -> 256))
-    val relu5 = Symbol.Activation()()(Map("data" -> conv5, "act_type" -> "relu"))
-    val pool3 = Symbol.Pooling()()(Map("data" -> relu5,
-        "kernel" -> "(3, 3)", "stride" -> "(2, 2)", "pool_type" -> "max"))
-    // stage 4
-    val flatten = Symbol.Flatten()()(Map("data" -> pool3))
-    val fc1 = Symbol.FullyConnected()()(Map("data" -> flatten, "num_hidden" -> 4096))
-    val relu6 = Symbol.Activation()()(Map("data" -> fc1, "act_type" -> "relu"))
-    val dropout1 = Symbol.Dropout()()(Map("data" -> relu6, "p" -> 0.5f))
-    // stage 5
-    val fc2 = Symbol.FullyConnected()()(Map("data" -> dropout1, "num_hidden" -> 4096))
-    val relu7 = Symbol.Activation()()(Map("data" -> fc2, "act_type" -> "relu"))
-    val dropout2 = Symbol.Dropout()()(Map("data" -> relu7, "p" -> 0.5f))
-    // stage 6
-    val fc3 = Symbol.FullyConnected()()(
-        Map("data" -> dropout2, "num_hidden" -> numClasses))
-    val softmax = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc3))
-    softmax
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/ExampleVis.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/ExampleVis.scala
deleted file mode 100644
index 2808852fd1ad..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/ExampleVis.scala
+++ /dev/null
@@ -1,90 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.visualization
-
-import org.kohsuke.args4j.{CmdLineParser, Option}
-import org.slf4j.LoggerFactory
-import scala.collection.JavaConverters._
-import scala.util.parsing.json._
-import org.apache.mxnet.Shape
-import org.apache.mxnet.Symbol
-import org.apache.mxnet.Visualization
-
-/**
- * @author Depeng Liang
- */
-object ExampleVis {
-  private val logger = LoggerFactory.getLogger(classOf[ExampleVis])
-
-  val netsList = List("LeNet", "AlexNet", "VGG", "GoogleNet",
-      "Inception_BN", "Inception_V3", "ResNet_Small")
-
-  val netShapes = Map(
-      "LeNet" -> Shape(1, 1, 28, 28),
-      "AlexNet" -> Shape(1, 1, 224, 224),
-      "VGG" -> Shape(1, 1, 224, 224),
-      "GoogleNet" -> Shape(1, 1, 299, 299),
-      "Inception_BN" -> Shape(1, 1, 299, 299),
-      "Inception_V3" -> Shape(1, 1, 299, 299),
-      "ResNet_Small" -> Shape(1, 1, 28, 28)
-  )
-
-  def getNetSymbol(net: String): (Symbol, Shape) = {
-    assert(netsList.contains(net), s"Supported nets: ${netsList.mkString(", ")}")
-    net match {
-      case "LeNet" => (LeNet.getSymbol(), netShapes(net))
-      case "AlexNet" => (AlexNet.getSymbol(), netShapes(net))
-      case "VGG" => (VGG.getSymbol(), netShapes(net))
-      case "GoogleNet" => (GoogleNet.getSymbol(), netShapes(net))
-      case "Inception_BN" => (Inception_BN.getSymbol(), netShapes(net))
-      case "Inception_V3" => (Inception_V3.getSymbol(), netShapes(net))
-      case "ResNet_Small" => (ResNet_Small.getSymbol(), netShapes(net))
-    }
-  }
-
-  def main(args: Array[String]): Unit = {
-    val leis = new ExampleVis
-    val parser: CmdLineParser = new CmdLineParser(leis)
-    try {
-      parser.parseArgument(args.toList.asJava)
-      assert(leis.outDir != null)
-
-      val (sym, shape) = getNetSymbol(leis.net)
-
-      val dot = Visualization.plotNetwork(symbol = sym,
-          title = leis.net, shape = Map("data" -> shape),
-          nodeAttrs = Map("shape" -> "rect", "fixedsize" -> "false"))
-
-      dot.render(engine = "dot", format = "pdf", fileName = leis.net, path = leis.outDir)
-
-    } catch {
-      case ex: Exception => {
-        logger.error(ex.getMessage, ex)
-        parser.printUsage(System.err)
-        sys.exit(1)
-      }
-    }
-  }
-}
-
-class ExampleVis {
-  @Option(name = "--out-dir", usage = "the output path")
-  private val outDir: String = null
-  @Option(name = "--net", usage = "network to visualize, e.g. LeNet, AlexNet, VGG ...")
-  private val net: String = "LeNet"
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/GoogleNet.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/GoogleNet.scala
deleted file mode 100644
index 73bff4d039b6..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/GoogleNet.scala
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.visualization
-
-import org.apache.mxnet.Symbol
-
-/**
- * @author Depeng Liang
- */
-object GoogleNet {
-
-  def ConvFactory(data: Symbol, numFilter: Int, kernel: (Int, Int), stride: (Int, Int) = (1, 1),
-      pad: (Int, Int) = (0, 0), name: String = "", suffix: String = ""): Symbol = {
-    val conv = Symbol.Convolution(s"conv_${name}${suffix}")()(
-      Map("data" -> data, "num_filter" -> numFilter, "kernel" -> s"$kernel",
-        "stride" -> s"$stride", "pad" -> s"$pad"))
-    val act = Symbol.Activation(s"relu_${name}${suffix}")()(
-        Map("data" -> conv, "act_type" -> "relu"))
-    act
-  }
-
-  def InceptionFactory(data: Symbol, num1x1: Int, num3x3red: Int, num3x3: Int,
-      numd5x5red: Int, numd5x5: Int, pool: String, proj: Int, name: String): Symbol = {
-      // 1x1
-      val c1x1 = ConvFactory(data = data, numFilter = num1x1,
-          kernel = (1, 1), name = s"${name}_1x1")
-      // 3x3 reduce + 3x3
-      val c3x3r = ConvFactory(data = data, numFilter = num3x3red,
-          kernel = (1, 1), name = s"${name}_3x3", suffix = "_reduce")
-      val c3x3 = ConvFactory(data = c3x3r, numFilter = num3x3,
-          kernel = (3, 3), pad = (1, 1), name = s"${name}_3x3")
-      // double 3x3 reduce + double 3x3
-      val cd5x5r = ConvFactory(data = data, numFilter = numd5x5red,
-          kernel = (1, 1), name = s"${name}_5x5", suffix = "_reduce")
-      val cd5x5 = ConvFactory(data = cd5x5r, numFilter = numd5x5,
-          kernel = (5, 5), pad = (2, 2), name = s"${name}_5x5")
-      // pool + proj
-      val pooling = Symbol.Pooling(s"${pool}_pool_${name}_pool")()(Map("data" -> data,
-          "kernel" -> "(3, 3)", "stride" -> "(1, 1)", "pad" -> "(1, 1)", "pool_type" -> pool))
-      val cproj = ConvFactory(data = pooling, numFilter = proj,
-          kernel = (1, 1), name = s"${name}_proj")
-      // concat
-      val concat =
-        Symbol.Concat(s"ch_concat_${name}_chconcat")(c1x1, c3x3, cd5x5, cproj)()
-      concat
-  }
-
-  def getSymbol(numClasses: Int = 1000): Symbol = {
-    val data = Symbol.Variable("data")
-    val conv1 = ConvFactory(data, 64, kernel = (7, 7),
-        stride = (2, 2), pad = (3, 3), name = "conv1")
-    val pool1 = Symbol.Pooling()()(Map("data" -> conv1, "kernel" -> "(3, 3)",
-        "stride" -> "(2, 2)", "pool_type" -> "max"))
-    val conv2 = ConvFactory(pool1, 64, kernel = (1, 1), stride = (1, 1), name = "conv2")
-    val conv3 = ConvFactory(conv2, 192, kernel = (3, 3),
-        stride = (1, 1), pad = (1, 1), name = "conv3")
-    val pool3 = Symbol.Pooling()()(Map("data" -> conv3,
-        "kernel" -> "(3, 3)", "stride" -> "(2, 2)", "pool_type" -> "max"))
-    val in3a = InceptionFactory(pool3, 64, 96, 128, 16, 32, "max", 32, name = "in3a")
-    val in3b = InceptionFactory(in3a, 128, 128, 192, 32, 96, "max", 64, name = "in3b")
-    val pool4 = Symbol.Pooling()()(Map("data" -> in3b, "kernel" -> "(3, 3)",
-        "stride" -> "(2, 2)", "pool_type" -> "max"))
-    val in4a = InceptionFactory(pool4, 192, 96, 208, 16, 48, "max", 64, name = "in4a")
-    val in4b = InceptionFactory(in4a, 160, 112, 224, 24, 64, "max", 64, name = "in4b")
-    val in4c = InceptionFactory(in4b, 128, 128, 256, 24, 64, "max", 64, name = "in4c")
-    val in4d = InceptionFactory(in4c, 112, 144, 288, 32, 64, "max", 64, name = "in4d")
-    val in4e = InceptionFactory(in4d, 256, 160, 320, 32, 128, "max", 128, name = "in4e")
-    val pool5 = Symbol.Pooling()()(Map("data" -> in4e, "kernel" -> "(3, 3)",
-        "stride" -> "(2, 2)", "pool_type" -> "max"))
-    val in5a = InceptionFactory(pool5, 256, 160, 320, 32, 128, "max", 128, name = "in5a")
-    val in5b = InceptionFactory(in5a, 384, 192, 384, 48, 128, "max", 128, name = "in5b")
-    val pool6 = Symbol.Pooling()()(Map("data" -> in5b, "kernel" -> "(7, 7)",
-        "stride" -> "(1,1)", "pool_type" -> "avg"))
-    val flatten = Symbol.Flatten()()(Map("data" -> pool6))
-    val fc1 = Symbol.FullyConnected()()(Map("data" -> flatten, "num_hidden" -> numClasses))
-    val softmax = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc1))
-    softmax
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/Inception_BN.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/Inception_BN.scala
deleted file mode 100644
index 0956957cdf05..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/Inception_BN.scala
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.visualization
-
-import org.apache.mxnet.Symbol
-
-/**
- * @author Depeng Liang
- */
-object Inception_BN {
-
-  def ConvFactory(data: Symbol, numFilter: Int, kernel: (Int, Int), stride: (Int, Int) = (1, 1),
-      pad: (Int, Int) = (0, 0), name: String = "", suffix: String = ""): Symbol = {
-    val conv = Symbol.Convolution(s"conv_${name}${suffix}")()(
-        Map("data" -> data, "num_filter" -> numFilter, "kernel" -> s"$kernel",
-            "stride" -> s"$stride", "pad" -> s"$pad"))
-    val bn = Symbol.BatchNorm(s"bn_${name}${suffix}")()(Map("data" -> conv))
-    val act = Symbol.Activation(s"relu_${name}${suffix}")()(
-        Map("data" -> bn, "act_type" -> "relu"))
-    act
-  }
-
-  def InceptionFactoryA(data: Symbol, num1x1: Int, num3x3red: Int, num3x3: Int,
-      numd3x3red: Int, numd3x3: Int, pool: String, proj: Int, name: String): Symbol = {
-    // 1x1
-    val c1x1 = ConvFactory(data = data, numFilter = num1x1,
-        kernel = (1, 1), name = s"${name}_1x1")
-    // 3x3 reduce + 3x3
-    val c3x3r = ConvFactory(data = data, numFilter = num3x3red,
-        kernel = (1, 1), name = s"${name}_3x3", suffix = "_reduce")
-    val c3x3 = ConvFactory(data = c3x3r, numFilter = num3x3,
-        kernel = (3, 3), pad = (1, 1), name = s"${name}_3x3")
-    // double 3x3 reduce + double 3x3
-    val cd3x3r = ConvFactory(data = data, numFilter = numd3x3red,
-        kernel = (1, 1), name = s"${name}_double_3x3", suffix = "_reduce")
-    var cd3x3 = ConvFactory(data = cd3x3r, numFilter = numd3x3,
-        kernel = (3, 3), pad = (1, 1), name = s"${name}_double_3x3_0")
-    cd3x3 = ConvFactory(data = cd3x3, numFilter = numd3x3,
-        kernel = (3, 3), pad = (1, 1), name = s"${name}_double_3x3_1")
-    // pool + proj
-    val pooling = Symbol.Pooling(s"${pool}_pool_${name}_pool")()(
-        Map("data" -> data, "kernel" -> "(3, 3)", "stride" -> "(1, 1)",
-            "pad" -> "(1, 1)", "pool_type" -> pool))
-    val cproj = ConvFactory(data = pooling, numFilter = proj,
-        kernel = (1, 1), name = s"${name}_proj")
-    // concat
-    val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")(c1x1, c3x3, cd3x3, cproj)()
-    concat
-  }
-
-  def InceptionFactoryB(data: Symbol, num3x3red : Int, num3x3 : Int,
-      numd3x3red : Int, numd3x3 : Int, name: String): Symbol = {
-    // 3x3 reduce + 3x3
-    val c3x3r = ConvFactory(data = data, numFilter = num3x3red,
-        kernel = (1, 1), name = s"${name}_3x3", suffix = "_reduce")
-    val c3x3 = ConvFactory(data = c3x3r, numFilter = num3x3,
-        kernel = (3, 3), pad = (1, 1), stride = (2, 2), name = s"${name}_3x3")
-    // double 3x3 reduce + double 3x3
-    val cd3x3r = ConvFactory(data = data, numFilter = numd3x3red,
-        kernel = (1, 1), name = s"${name}_double_3x3", suffix = "_reduce")
-    var cd3x3 = ConvFactory(data = cd3x3r, numFilter = numd3x3,
-        kernel = (3, 3), pad = (1, 1), stride = (1, 1), name = s"${name}_double_3x3_0")
-    cd3x3 = ConvFactory(data = cd3x3, numFilter = numd3x3,
-        kernel = (3, 3), pad = (1, 1), stride = (2, 2), name = s"${name}_double_3x3_1")
-    // pool + proj
-    val pooling = Symbol.Pooling(s"max_pool_${name}_pool")()(
-        Map("data" -> data, "kernel" -> "(3, 3)", "stride" -> "(2, 2)",
-            "pad" -> "(1, 1)", "pool_type" -> "max"))
-    // concat
-    val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")(c3x3, cd3x3, pooling)()
-    concat
-  }
-
-  def getSymbol(numClasses: Int = 1000): Symbol = {
-    // data
-    val data = Symbol.Variable("data")
-    // stage 1
-    val conv1 = ConvFactory(data = data, numFilter = 64,
-        kernel = (7, 7), stride = (2, 2), pad = (3, 3), name = "conv1")
-    val pool1 = Symbol.Pooling("pool1")()(Map("data" -> conv1, "kernel" -> "(3, 3)",
-        "stride" -> "(2, 2)", "pool_type" -> "max"))
-    // stage 2
-    val conv2red = ConvFactory(data = pool1, numFilter = 64,
-        kernel = (1, 1), stride = (1, 1), name = "conv2red")
-    val conv2 = ConvFactory(data = conv2red, numFilter = 192,
-        kernel = (3, 3), stride = (1, 1), pad = (1, 1), name = "conv2")
-    val pool2 = Symbol.Pooling("pool2")()(Map("data" -> conv2, "kernel" -> "(3, 3)",
-        "stride" -> "(2, 2)", "pool_type" -> "max"))
-    // stage 2
-    val in3a = InceptionFactoryA(pool2, 64, 64, 64, 64, 96, "avg", 32, "3a")
-    val in3b = InceptionFactoryA(in3a, 64, 64, 96, 64, 96, "avg", 64, "3b")
-    val in3c = InceptionFactoryB(in3b, 128, 160, 64, 96, "3c")
-    // stage 3
-    val in4a = InceptionFactoryA(in3c, 224, 64, 96, 96, 128, "avg", 128, "4a")
-    val in4b = InceptionFactoryA(in4a, 192, 96, 128, 96, 128, "avg", 128, "4b")
-    val in4c = InceptionFactoryA(in4b, 160, 128, 160, 128, 160, "avg", 128, "4c")
-    val in4d = InceptionFactoryA(in4c, 96, 128, 192, 160, 192, "avg", 128, "4d")
-    val in4e = InceptionFactoryB(in4d, 128, 192, 192, 256, "4e")
-    // stage 4
-    val in5a = InceptionFactoryA(in4e, 352, 192, 320, 160, 224, "avg", 128, "5a")
-    val in5b = InceptionFactoryA(in5a, 352, 192, 320, 192, 224, "max", 128, "5b")
-    // global avg pooling
-    val avg = Symbol.Pooling("global_pool")()(Map("data" -> in5b, "kernel" -> "(7, 7)",
-        "stride" -> "(1, 1)", "pool_type" -> "avg"))
-    // linear classifier
-    val flatten = Symbol.Flatten("flatten")()(Map("data" -> avg))
-    val fc1 = Symbol.FullyConnected("fc1")()(
-        Map("data" -> flatten, "num_hidden" -> numClasses))
-    val softmax = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc1))
-    softmax
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/Inception_V3.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/Inception_V3.scala
deleted file mode 100644
index 8fc8f4a3edbe..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/Inception_V3.scala
+++ /dev/null
@@ -1,253 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.visualization
-
-import org.apache.mxnet.Symbol
-
-/**
- * @author Depeng Liang
- */
-object Inception_V3 {
-
-  def Conv(data: Symbol, numFilter: Int, kernel: (Int, Int) = (1, 1), stride: (Int, Int) = (1, 1),
-      pad: (Int, Int) = (0, 0), name: String = "", suffix: String = ""): Symbol = {
-    val conv = Symbol.Convolution(s"${name}${suffix}_conv2d")()(
-        Map("data" -> data,
-        "num_filter" -> numFilter, "kernel" -> s"$kernel", "stride" -> s"$stride"
-        , "pad" -> s"$pad", "no_bias" -> true))
-    val bn = Symbol.BatchNorm(s"${name}${suffix}_batchnorm")()(
-        Map("data" -> conv, "fix_gamma" -> true))
-    val act = Symbol.Activation(s"${name}${suffix}_relu")()(
-        Map("data" -> bn, "act_type" -> "relu"))
-    act
-  }
-
-  def Inception7A(
-      data: Symbol,
-      num_1x1: Int,
-      num_3x3_red: Int, num_3x3_1: Int, num_3x3_2: Int,
-      num_5x5_red: Int, num_5x5: Int,
-      pool: String, proj: Int,
-      name: String): Symbol = {
-    val tower_1x1 = Conv(data, num_1x1, name = s"${name}_conv")
-    var tower_5x5 = Conv(data, num_5x5_red,
-        name = s"${name}_tower", suffix = "_conv")
-    tower_5x5 = Conv(tower_5x5, num_5x5, kernel = (5, 5),
-        pad = (2, 2), name = s"${name}_tower", suffix = "_conv_1")
-    var tower_3x3 = Conv(data, num_3x3_red,
-        name = s"${name}_tower_1", suffix = "_conv")
-    tower_3x3 = Conv(tower_3x3, num_3x3_1, kernel = (3, 3),
-        pad = (1, 1), name = s"${name}_tower_1", suffix = "_conv_1")
-    tower_3x3 = Conv(tower_3x3, num_3x3_2, kernel = (3, 3),
-        pad = (1, 1), name = s"${name}_tower_1", suffix = "_conv_2")
-    val pooling = Symbol.Pooling(s"${pool}_pool_${name}_pool")()(
-        Map("data" -> data, "kernel" -> "(3, 3)",
-        "stride" -> "(1, 1)", "pad" -> "(1, 1)", "pool_type" -> pool))
-    val cproj = Conv(pooling, proj, name = s"${name}_tower_2", suffix = "_conv")
-    val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")(
-        tower_1x1, tower_5x5, tower_3x3, cproj)()
-    concat
-  }
-
-  // First Downsample
-  def Inception7B(
-      data: Symbol,
-      num_3x3: Int,
-      num_d3x3_red: Int, num_d3x3_1: Int, num_d3x3_2: Int,
-      pool: String,
-      name: String): Symbol = {
-    val tower_3x3 = Conv(data, num_3x3, kernel = (3, 3), pad = (0, 0),
-        stride = (2, 2), name = s"${name}_conv")
-    var tower_d3x3 = Conv(data, num_d3x3_red,
-        name = s"${name}_tower", suffix = "_conv")
-    tower_d3x3 = Conv(tower_d3x3, num_d3x3_1, kernel = (3, 3),
-        pad = (1, 1), stride = (1, 1), name = s"${name}_tower", suffix = "_conv_1")
-    tower_d3x3 = Conv(tower_d3x3, num_d3x3_2, kernel = (3, 3),
-        pad = (0, 0), stride = (2, 2), name = s"${name}_tower", suffix = "_conv_2")
-    val pooling = Symbol.Pooling(s"max_pool_${name}_pool")()(Map("data" -> data,
-        "kernel" -> "(3, 3)", "stride" -> "(2, 2)", "pad" -> "(0,0)", "pool_type" -> "max"))
-    val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")(
-        tower_3x3, tower_d3x3, pooling)()
-    concat
-  }
-
-  // scalastyle:off parameterNum
-  def Inception7C(
-      data: Symbol,
-      num_1x1: Int,
-      num_d7_red: Int, num_d7_1: Int, num_d7_2: Int,
-      num_q7_red: Int, num_q7_1: Int, num_q7_2: Int, num_q7_3: Int, num_q7_4: Int,
-      pool: String, proj: Int,
-      name: String): Symbol = {
-    val tower_1x1 = Conv(data = data, numFilter = num_1x1,
-        kernel = (1, 1), name = s"${name}_conv")
-    var tower_d7 = Conv(data = data, numFilter = num_d7_red,
-        name = s"${name}_tower", suffix = "_conv")
-    tower_d7 = Conv(data = tower_d7, numFilter = num_d7_1, kernel = (1, 7),
-        pad = (0, 3), name = s"${name}_tower", suffix = "_conv_1")
-    tower_d7 = Conv(data = tower_d7, numFilter = num_d7_2, kernel = (7, 1),
-        pad = (3, 0), name = s"${name}_tower", suffix = "_conv_2")
-    var tower_q7 = Conv(data = data, numFilter = num_q7_red,
-        name = s"${name}_tower_1", suffix = "_conv")
-    tower_q7 = Conv(data = tower_q7, numFilter = num_q7_1, kernel = (7, 1),
-        pad = (3, 0), name = s"${name}_tower_1", suffix = "_conv_1")
-    tower_q7 = Conv(data = tower_q7, numFilter = num_q7_2, kernel = (1, 7),
-        pad = (0, 3), name = s"${name}_tower_1", suffix = "_conv_2")
-    tower_q7 = Conv(data = tower_q7, numFilter = num_q7_3, kernel = (7, 1),
-        pad = (3, 0), name = s"${name}_tower_1", suffix = "_conv_3")
-    tower_q7 = Conv(data = tower_q7, numFilter = num_q7_4, kernel = (1, 7),
-        pad = (0, 3), name = s"${name}_tower_1", suffix = "_conv_4")
-    val pooling = Symbol.Pooling(s"${pool}_pool_${name}_pool")()(
-        Map("data" -> data, "kernel" -> "(3, 3)",
-        "stride" -> "(1, 1)", "pad" -> "(1, 1)", "pool_type" -> pool))
-    val cproj = Conv(data = pooling, numFilter = proj, kernel = (1, 1),
-        name = s"${name}_tower_2", suffix = "_conv")
-    // concat
-    val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")(
-        tower_1x1, tower_d7, tower_q7, cproj)()
-    concat
-  }
-
-  def Inception7D(
-      data: Symbol,
-      num_3x3_red: Int, num_3x3: Int,
-      num_d7_3x3_red: Int, num_d7_1: Int, num_d7_2: Int, num_d7_3x3: Int,
-      pool: String,
-      name: String): Symbol = {
-    var tower_3x3 = Conv(data = data, numFilter = num_3x3_red,
-        name = s"${name}_tower", suffix = "_conv")
-    tower_3x3 = Conv(data = tower_3x3, numFilter = num_3x3, kernel = (3, 3),
-        pad = (0, 0), stride = (2, 2), name = s"${name}_tower", suffix = "_conv_1")
-    var tower_d7_3x3 = Conv(data = data, numFilter = num_d7_3x3_red,
-        name = s"${name}_tower_1", suffix = "_conv")
-    tower_d7_3x3 = Conv(data = tower_d7_3x3, numFilter = num_d7_1,
-        kernel = (1, 7), pad = (0, 3), name = s"${name}_tower_1", suffix = "_conv_1")
-    tower_d7_3x3 = Conv(data = tower_d7_3x3, numFilter = num_d7_2,
-        kernel = (7, 1), pad = (3, 0), name = s"${name}_tower_1", suffix = "_conv_2")
-    tower_d7_3x3 = Conv(data = tower_d7_3x3, numFilter = num_d7_3x3,
-        kernel = (3, 3), stride = (2, 2), name = s"${name}_tower_1", suffix = "_conv_3")
-    val pooling = Symbol.Pooling(s"${pool}_pool_${name}_pool")()(
-        Map("data" -> data, "kernel" -> "(3, 3)", "stride" -> "(2, 2)", "pool_type" -> pool))
-    // concat
-    val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")(
-        tower_3x3, tower_d7_3x3, pooling)()
-    concat
-  }
-
-  def Inception7E(
-      data: Symbol,
-      num_1x1: Int,
-      num_d3_red: Int, num_d3_1: Int, num_d3_2: Int,
-      num_3x3_d3_red: Int, num_3x3: Int, num_3x3_d3_1: Int, num_3x3_d3_2: Int,
-      pool: String, proj: Int,
-      name: String): Symbol = {
-    val tower_1x1 = Conv(data = data, numFilter = num_1x1,
-        kernel = (1, 1), name = s"${name}_conv")
-    val tower_d3 = Conv(data = data, numFilter = num_d3_red,
-        name = s"${name}_tower", suffix = "_conv")
-    val tower_d3_a = Conv(data = tower_d3, numFilter = num_d3_1, kernel = (1, 3),
-        pad = (0, 1), name = s"${name}_tower", suffix = "_mixed_conv")
-    val tower_d3_b = Conv(data = tower_d3, numFilter = num_d3_2, kernel = (3, 1),
-        pad = (1, 0), name = s"${name}_tower", suffix = "_mixed_conv_1")
-    var tower_3x3_d3 = Conv(data = data, numFilter = num_3x3_d3_red,
-        name = s"${name}_tower_1", suffix = "_conv")
-    tower_3x3_d3 = Conv(data = tower_3x3_d3, numFilter = num_3x3, kernel = (3, 3),
-        pad = (1, 1), name = s"${name}_tower_1", suffix = "_conv_1")
-    val tower_3x3_d3_a = Conv(data = tower_3x3_d3, numFilter = num_3x3_d3_1,
-        kernel = (1, 3), pad = (0, 1), name = s"${name}_tower_1", suffix = "_mixed_conv")
-    val tower_3x3_d3_b = Conv(data = tower_3x3_d3, numFilter = num_3x3_d3_2,
-        kernel = (3, 1), pad = (1, 0), name = s"${name}_tower_1", suffix = "_mixed_conv_1")
-    val pooling = Symbol.Pooling(s"${pool}_pool_${name}_pool")()(Map("data" -> data,
-        "kernel" -> "(3, 3)", "stride" -> "(1, 1)", "pad" -> "(1, 1)", "pool_type" -> pool))
-    val cproj = Conv(data = pooling, numFilter = proj, kernel = (1, 1),
-        name = s"${name}_tower_2", suffix = "_conv")
-    // concat
-    val concat = Symbol.Concat(s"ch_concat_${name}_chconcat")(
-        tower_1x1, tower_d3_a, tower_d3_b, tower_3x3_d3_a, tower_3x3_d3_b, cproj)()
-    concat
-  }
-  // scalastyle:on parameterNum
-
-  def getSymbol(numClasses: Int = 1000): Symbol = {
-    val data = Symbol.Variable("data")
-    // stage 1
-    val conv = Conv(data, 32, kernel = (3, 3), stride = (2, 2), name = "conv")
-    val conv_1 = Conv(conv, 32, kernel = (3, 3), name = "conv_1")
-    val conv_2 = Conv(conv_1, 64, kernel = (3, 3), pad = (1, 1), name = "conv_2")
-    var pool = Symbol.Pooling("pool")()(Map("data" -> conv_2, "kernel" -> "(3, 3)",
-        "stride" -> "(2, 2)", "pool_type" -> "max"))
-    // stage 2
-    val conv_3 = Conv(pool, 80, kernel = (1, 1), name = "conv_3")
-    val conv_4 = Conv(conv_3, 192, kernel = (3, 3), name = "conv_4")
-    val pool1 = Symbol.Pooling("pool1")()(Map("data" -> conv_4, "kernel" -> "(3, 3)",
-        "stride" -> "(2, 2)", "pool_type" -> "max"))
-    // stage 3
-    val in3a = Inception7A(pool1, 64,
-                       64, 96, 96,
-                       48, 64,
-                       "avg", 32, "mixed")
-    val in3b = Inception7A(in3a, 64,
-                       64, 96, 96,
-                       48, 64,
-                       "avg", 64, "mixed_1")
-    val in3c = Inception7A(in3b, 64,
-                       64, 96, 96,
-                       48, 64,
-                       "avg", 64, "mixed_2")
-    val in3d = Inception7B(in3c, 384,
-                       64, 96, 96,
-                       "max", "mixed_3")
-    // stage 4
-    val in4a = Inception7C(in3d, 192,
-                       128, 128, 192,
-                       128, 128, 128, 128, 192,
-                       "avg", 192, "mixed_4")
-    val in4b = Inception7C(in4a, 192,
-                       160, 160, 192,
-                       160, 160, 160, 160, 192,
-                       "avg", 192, "mixed_5")
-    val in4c = Inception7C(in4b, 192,
-                       160, 160, 192,
-                       160, 160, 160, 160, 192,
-                       "avg", 192, "mixed_6")
-    val in4d = Inception7C(in4c, 192,
-                       192, 192, 192,
-                       192, 192, 192, 192, 192,
-                       "avg", 192, "mixed_7")
-    val in4e = Inception7D(in4d, 192, 320,
-                       192, 192, 192, 192,
-                       "max", "mixed_8")
-    // stage 5
-    val in5a = Inception7E(in4e, 320,
-                       384, 384, 384,
-                       448, 384, 384, 384,
-                       "avg", 192, "mixed_9")
-    val in5b = Inception7E(in5a, 320,
-                       384, 384, 384,
-                       448, 384, 384, 384,
-                       "max", 192, "mixed_10")
-    // pool
-    pool = Symbol.Pooling("global_pool")()(Map("data" -> in5b,
-        "kernel" -> "(8, 8)", "stride" -> "(1, 1)", "pool_type" -> "avg"))
-    val flatten = Symbol.Flatten("flatten")()(Map("data" -> pool))
-    val fc1 = Symbol.FullyConnected("fc1")()(
-        Map("data" -> flatten, "num_hidden" -> numClasses))
-    val softmax = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc1))
-    softmax
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/LeNet.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/LeNet.scala
deleted file mode 100644
index 9bc82b82dcd1..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/LeNet.scala
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.visualization
-
-import org.apache.mxnet.Symbol
-
-/**
- * @author Depeng Liang
- */
-object LeNet {
-
-  def getSymbol(numClasses: Int = 10): Symbol = {
-    val data = Symbol.Variable("data")
-    // first conv
-    val conv1 = Symbol.Convolution()()(
-      Map("data" -> data, "kernel" -> "(5, 5)", "num_filter" -> 20))
-    val tanh1 = Symbol.Activation()()(Map("data" -> conv1, "act_type" -> "tanh"))
-    val pool1 = Symbol.Pooling()()(Map("data" -> tanh1, "pool_type" -> "max",
-                                       "kernel" -> "(2, 2)", "stride" -> "(2, 2)"))
-    // second conv
-    val conv2 = Symbol.Convolution()()(
-      Map("data" -> pool1, "kernel" -> "(5, 5)", "num_filter" -> 50))
-    val tanh2 = Symbol.Activation()()(Map("data" -> conv2, "act_type" -> "tanh"))
-    val pool2 = Symbol.Pooling()()(Map("data" -> tanh2, "pool_type" -> "max",
-                                       "kernel" -> "(2, 2)", "stride" -> "(2, 2)"))
-    // first fullc
-    val flatten = Symbol.Flatten()()(Map("data" -> pool2))
-    val fc1 = Symbol.FullyConnected()()(Map("data" -> flatten, "num_hidden" -> 500))
-    val tanh3 = Symbol.Activation()()(Map("data" -> fc1, "act_type" -> "tanh"))
-    // second fullc
-    val fc2 = Symbol.FullyConnected()()(
-        Map("data" -> tanh3, "num_hidden" -> numClasses))
-    // loss
-    val lenet = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc2))
-    lenet
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/ResNet_Small.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/ResNet_Small.scala
deleted file mode 100644
index 42413d298608..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/ResNet_Small.scala
+++ /dev/null
@@ -1,110 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.visualization
-
-import org.apache.mxnet.Symbol
-
-/**
- * @author Depeng Liang
- */
-object ResNet_Small {
-
-  sealed trait ConvType
-  case object ConvWithoutAct extends ConvType
-  case object ConvWitAct extends ConvType
-
-  def convFactory(data: Symbol, numFilter: Int, kernel: (Int, Int),
-      stride: (Int, Int), pad: (Int, Int), actType: String = "relu",
-      convType: ConvType = ConvWitAct): Symbol = convType match {
-    case ConvWitAct => {
-      val conv = Symbol.Convolution()()(Map("data" -> data,
-          "num_filter" -> numFilter, "kernel" -> s"$kernel",
-          "stride" -> s"$stride", "pad" -> s"$pad"))
-      val bn = Symbol.BatchNorm()()(Map("data" -> conv))
-      val act = Symbol.Activation()()(Map("data" -> bn, "act_type" -> actType))
-      act
-    }
-    case ConvWithoutAct => {
-      val conv = Symbol.Convolution()()(Map("data" -> data,
-          "num_filter" -> numFilter, "kernel" -> s"$kernel",
-          "stride" -> s"$stride", "pad" -> s"$pad"))
-      val bn = Symbol.BatchNorm()()(Map("data" -> conv))
-      bn
-    }
-  }
-
-  def residualFactory(data: Symbol, numFilter: Int, dimMatch: Boolean): Symbol = {
-    if (dimMatch == true) {
-        val identityData = data
-        val conv1 = convFactory(data = data, numFilter = numFilter, kernel = (3, 3),
-            stride = (1, 1), pad = (1, 1), actType = "relu", convType = ConvWitAct)
-
-        val conv2 = convFactory(data = conv1, numFilter = numFilter, kernel = (3, 3),
-            stride = (1, 1), pad = (1, 1), convType = ConvWithoutAct)
-        val newData = identityData + conv2
-        val act = Symbol.Activation()()(Map("data" -> newData, "act_type" -> "relu"))
-        act
-    } else {
-        val conv1 = convFactory(data = data, numFilter = numFilter, kernel = (3, 3),
-            stride = (2, 2), pad = (1, 1), actType = "relu", convType = ConvWitAct)
-        val conv2 = convFactory(data = conv1, numFilter = numFilter, kernel = (3, 3),
-            stride = (1, 1), pad = (1, 1), convType = ConvWithoutAct)
-
-        // adopt project method in the paper when dimension increased
-        val projectData = convFactory(data = data, numFilter = numFilter, kernel = (1, 1),
-            stride = (2, 2), pad = (0, 0), convType = ConvWithoutAct)
-        val newData = projectData + conv2
-        val act = Symbol.Activation()()(Map("data" -> newData, "act_type" -> "relu"))
-        act
-    }
-  }
-
-  def residualNet(data: Symbol, n: Int): Symbol = {
-    // fisrt 2n layers
-    val data1 = (data /: (0 until n)) { (acc, elem) =>
-      residualFactory(data = acc, numFilter = 16, dimMatch = true)
-    }
-
-    // second 2n layers
-    val data2 = (data1 /: (0 until n)) { (acc, elem) =>
-      if (elem == 0) residualFactory(data = acc, numFilter = 32, dimMatch = false)
-      else residualFactory(data = acc, numFilter = 32, dimMatch = true)
-    }
-
-    // third 2n layers
-    val data3 = (data2 /: (0 until n)) { (acc, elem) =>
-      if (elem == 0) residualFactory(data = acc, numFilter = 64, dimMatch = false)
-      else residualFactory(data = acc, numFilter = 64, dimMatch = true)
-    }
-     data3
-  }
-
-  def getSymbol(numClasses: Int = 1000): Symbol = {
-    val conv = convFactory(data = Symbol.Variable("data"), numFilter = 16,
-        kernel = (3, 3), stride = (1, 1), pad = (1, 1), actType = "relu", convType = ConvWitAct)
-    // set n = 3 means get a model with 3*6+2=20 layers, set n = 9 means 9*6+2=56 layers
-    val n = 3
-    val resNet = residualNet(conv, n)
-    val pool = Symbol.Pooling()()(Map("data" -> resNet,
-        "kernel" -> "(7,7)", "pool_type" -> "avg"))
-    val flatten = Symbol.Flatten("flatten")()(Map("data" -> pool))
-    val fc = Symbol.FullyConnected("fc1")()(Map("data" -> flatten, "num_hidden" -> numClasses))
-    val softmax = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc))
-    softmax
-  }
-}
diff --git a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/VGG.scala b/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/VGG.scala
deleted file mode 100644
index abfb2efcec2d..000000000000
--- a/scala-package/examples/src/main/scala/org/apache/mxnetexamples/visualization/VGG.scala
+++ /dev/null
@@ -1,84 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.visualization
-
-import org.apache.mxnet.Symbol
-
-/**
- * @author Depeng Liang
- */
-object VGG {
-
-  def getSymbol(numClasses: Int = 1000): Symbol = {
-    // define alexnet
-    val data = Symbol.Variable("data")
-    // group 1
-    val conv1_1 = Symbol.Convolution("conv1_1")()(
-        Map("data" -> data, "num_filter" -> 64, "pad" -> "(1,1)", "kernel" -> "(3,3)"))
-    val relu1_1 = Symbol.Activation("relu1_1")()(Map("data" -> conv1_1, "act_type" -> "relu"))
-    val pool1 = Symbol.Pooling("pool1")()(
-        Map("data" -> relu1_1, "pool_type" -> "max", "kernel" -> "(2, 2)", "stride" -> "(2,2)"))
-    // group 2
-    val conv2_1 = Symbol.Convolution("conv2_1")()(
-        Map("data" -> pool1, "num_filter" -> 128, "pad" -> "(1,1)", "kernel" -> "(3,3)"))
-    val relu2_1 = Symbol.Activation("relu2_1")()(Map("data" -> conv2_1, "act_type" -> "relu"))
-    val pool2 = Symbol.Pooling("pool2")()(
-        Map("data" -> relu2_1, "pool_type" -> "max", "kernel" -> "(2, 2)", "stride" -> "(2,2)"))
-    // group 3
-    val conv3_1 = Symbol.Convolution("conv3_1")()(
-        Map("data" -> pool2, "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)"))
-    val relu3_1 = Symbol.Activation("relu3_1")()(Map("data" -> conv3_1, "act_type" -> "relu"))
-    val conv3_2 = Symbol.Convolution("conv3_2")()(
-        Map("data" -> relu3_1, "num_filter" -> 256, "pad" -> "(1,1)", "kernel" -> "(3,3)"))
-    val relu3_2 = Symbol.Activation("relu3_2")()(Map("data" -> conv3_2 , "act_type" -> "relu"))
-    val pool3 = Symbol.Pooling("pool3")()(
-        Map("data" -> relu3_2, "pool_type" -> "max", "kernel" -> "(2, 2)", "stride" -> "(2,2)"))
-    // group 4
-    val conv4_1 = Symbol.Convolution("conv4_1")()(
-        Map("data" -> pool3, "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)"))
-    val relu4_1 = Symbol.Activation("relu4_1")()(Map("data" -> conv4_1 , "act_type" -> "relu"))
-    val conv4_2 = Symbol.Convolution("conv4_2")()(
-        Map("data" -> relu4_1, "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)"))
-    val relu4_2 = Symbol.Activation("relu4_2")()(Map("data" -> conv4_2 , "act_type" -> "relu"))
-    val pool4 = Symbol.Pooling("pool4")()(
-        Map("data" -> relu4_2, "pool_type" -> "max", "kernel" -> "(2, 2)", "stride" -> "(2,2)"))
-    // group 5
-    val conv5_1 = Symbol.Convolution("conv5_1")()(
-        Map("data" -> pool4, "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)"))
-    val relu5_1 = Symbol.Activation("relu5_1")()(Map("data" -> conv5_1, "act_type" -> "relu"))
-    val conv5_2 = Symbol.Convolution("conv5_2")()(
-        Map("data" -> relu5_1, "num_filter" -> 512, "pad" -> "(1,1)", "kernel" -> "(3,3)"))
-    val relu5_2 = Symbol.Activation("relu5_2")()(Map("data" -> conv5_2, "act_type" -> "relu"))
-    val pool5 = Symbol.Pooling("pool5")()(
-        Map("data" -> relu5_2, "pool_type" -> "max", "kernel" -> "(2, 2)", "stride" -> "(2,2)"))
-    // group 6
-    val flatten = Symbol.Flatten("flatten")()(Map("data" -> pool5))
-    val fc6 = Symbol.FullyConnected("fc6")()(Map("data" -> flatten, "num_hidden" -> 4096))
-    val relu6 = Symbol.Activation("relu6")()(Map("data" -> fc6, "act_type" -> "relu"))
-    val drop6 = Symbol.Dropout("drop6")()(Map("data" -> relu6, "p" -> 0.5f))
-    // group 7
-    val fc7 = Symbol.FullyConnected("fc7")()(Map("data" -> drop6, "num_hidden" -> 4096))
-    val relu7 = Symbol.Activation("relu7")()(Map("data" -> fc7, "act_type" -> "relu"))
-    val drop7 = Symbol.Dropout("drop7")()(Map("data" -> relu7, "p" -> 0.5f))
-    // output
-    val fc8 = Symbol.FullyConnected("fc8")()(
-        Map("data" -> drop7, "num_hidden" -> numClasses))
-    val softmax = Symbol.SoftmaxOutput("softmax")()(Map("data" -> fc8))
-    softmax
-  }
-}
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/benchmark/ScalaInferenceBenchmarkSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/benchmark/ScalaInferenceBenchmarkSuite.scala
deleted file mode 100644
index 548f2e4122e0..000000000000
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/benchmark/ScalaInferenceBenchmarkSuite.scala
+++ /dev/null
@@ -1,113 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.mxnetexamples.benchmark
-
-import java.io.File
-
-import org.apache.mxnetexamples.Util
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
-import org.slf4j.LoggerFactory
-
-import scala.language.postfixOps
-import scala.sys.process.Process
-
-class ScalaInferenceBenchmarkSuite  extends FunSuite with BeforeAndAfterAll {
-  private val logger = LoggerFactory.getLogger(classOf[ScalaInferenceBenchmarkSuite])
-  override def beforeAll(): Unit = {
-  }
-
-  test("Testing Benchmark -- Image Classification") {
-    logger.info("Downloading resnet-18 model")
-    val tempDirPath = System.getProperty("java.io.tmpdir")
-    logger.info("tempDirPath: %s".format(tempDirPath))
-    val baseUrl = "https://s3.us-east-2.amazonaws.com/scala-infer-models"
-    Util.downloadUrl(baseUrl + "/resnet-18/resnet-18-symbol.json",
-      tempDirPath + "/resnet18/resnet-18-symbol.json")
-    Util.downloadUrl(baseUrl + "/resnet-18/resnet-18-0000.params",
-      tempDirPath + "/resnet18/resnet-18-0000.params")
-    Util.downloadUrl(baseUrl + "/resnet-18/synset.txt",
-      tempDirPath + "/resnet18/synset.txt")
-    Util.downloadUrl("https://s3.amazonaws.com/model-server/inputs/Pug-Cookie.jpg",
-      tempDirPath + "/inputImages/resnet18/Pug-Cookie.jpg")
-    val modelDirPath = tempDirPath + File.separator + "resnet18/"
-    val inputImagePath = tempDirPath + File.separator +
-      "inputImages/resnet18/Pug-Cookie.jpg"
-    val inputImageDir = tempDirPath + File.separator + "inputImages/resnet18/"
-    val args = Array(
-      "--example", "ImageClassifierExample",
-      "--count", "1",
-      "--batchSize", "10",
-      "--model-path-prefix", s"$modelDirPath/resnet-18",
-      "--input-image", inputImagePath,
-      "--input-dir", inputImageDir
-    )
-    ScalaInferenceBenchmark.main(args)
-  }
-
-  test("Testing Benchmark -- Object Detection") {
-    logger.info("Downloading resnetssd model")
-    val tempDirPath = System.getProperty("java.io.tmpdir")
-    logger.info("tempDirPath: %s".format(tempDirPath))
-    val modelBase = "https://s3.amazonaws.com/model-server/models/resnet50_ssd/"
-    val imageBase = "https://s3.amazonaws.com/model-server/inputs/"
-    Util.downloadUrl(modelBase + "resnet50_ssd_model-symbol.json",
-      tempDirPath + "/resnetssd/resnet50_ssd_model-symbol.json")
-    Util.downloadUrl(modelBase + "resnet50_ssd_model-0000.params",
-      tempDirPath + "/resnetssd/resnet50_ssd_model-0000.params")
-    Util.downloadUrl(modelBase + "synset.txt",
-      tempDirPath + "/resnetssd/synset.txt")
-    Util.downloadUrl(imageBase + "dog-ssd.jpg",
-      tempDirPath + "/inputImages/resnetssd/dog-ssd.jpg")
-    val modelDirPath = tempDirPath + File.separator + "resnetssd/"
-    val inputImagePath = tempDirPath + File.separator +
-      "inputImages/resnetssd/dog-ssd.jpg"
-    val inputImageDir = tempDirPath + File.separator + "inputImages/resnetssd/"
-    val args = Array(
-      "--example", "ObjectDetectionExample",
-      "--count", "1",
-      "--batchSize", "10",
-      "--model-path-prefix", s"$modelDirPath/resnet50_ssd_model",
-      "--input-image", inputImagePath,
-      "--input-dir", inputImageDir
-    )
-    ScalaInferenceBenchmark.main(args)
-  }
-
-  test("Testing Benchmark -- charRNN Model") {
-    logger.info("Downloading LSTM model")
-    val tempDirPath = System.getProperty("java.io.tmpdir")
-    logger.info("tempDirPath: %s".format(tempDirPath))
-    val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/RNN/"
-    Util.downloadUrl(baseUrl + "obama.zip", tempDirPath + "/RNN/obama.zip")
-    Util.downloadUrl(baseUrl + "sherlockholmes.train.txt",
-      tempDirPath + "/RNN/sherlockholmes.train.txt")
-    Util.downloadUrl(baseUrl + "sherlockholmes.valid.txt",
-      tempDirPath + "/RNN/sherlockholmes.valid.txt")
-    // TODO: Need to confirm with Windows
-    Process(s"unzip $tempDirPath/RNN/obama.zip -d $tempDirPath/RNN/") !
-
-    val args = Array(
-      "--example", "CharRnn",
-      "--count", "1",
-      "--data-path", s"$tempDirPath/RNN/obama.txt",
-      "--model-prefix", s"$tempDirPath/RNN/obama",
-      "--starter-sentence", "The joke"
-    )
-    ScalaInferenceBenchmark.main(args)
-  }
-
-}
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala
deleted file mode 100644
index 0424c1262835..000000000000
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/cnntextclassification/CNNClassifierExampleSuite.scala
+++ /dev/null
@@ -1,70 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.cnntextclassification
-
-import java.io.File
-import java.net.URL
-
-import org.apache.commons.io.FileUtils
-import org.apache.mxnet.Context
-import org.apache.mxnetexamples.Util
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
-import org.slf4j.LoggerFactory
-
-import scala.language.postfixOps
-import scala.sys.process.Process
-
-/**
-  * Integration test for CNN example.
-  */
-class CNNClassifierExampleSuite extends FunSuite with BeforeAndAfterAll {
-  private val logger = LoggerFactory.getLogger(classOf[CNNClassifierExampleSuite])
-
-  test("Example CI - CNN Example") {
-
-    if (System.getenv().containsKey("SCALA_TEST_ON_GPU") &&
-      System.getenv("SCALA_TEST_ON_GPU").toInt == 1) {
-      val context = Context.gpu()
-      val tempDirPath = System.getProperty("java.io.tmpdir")
-      val w2vModelName = "GoogleNews-vectors-negative300-SLIM.bin"
-
-      logger.info("tempDirPath: %s".format(tempDirPath))
-
-      logger.info("Downloading CNN text...")
-      val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala"
-      Util.downloadUrl(baseUrl + "/scala-example-ci/CNN/rt-polarity.pos",
-        tempDirPath + "/CNN/rt-polarity.pos")
-      Util.downloadUrl(baseUrl + "/scala-example-ci/CNN/rt-polarity.neg",
-        tempDirPath + "/CNN/rt-polarity.neg")
-      logger.info("Downloading pretrianed Word2Vec Model, may take a while")
-      Util.downloadUrl(baseUrl + "/scala-example-ci/CNN/" + w2vModelName,
-        tempDirPath + "/CNN/" + w2vModelName)
-
-      val modelDirPath = tempDirPath + File.separator + "CNN"
-
-      val output = CNNTextClassification.test(modelDirPath + File.separator + w2vModelName,
-        modelDirPath, context, modelDirPath)
-
-      Process("rm -rf " + modelDirPath) !
-
-      assert(output >= 0.4f)
-    } else {
-      logger.info("Skip this test as it intended for GPU only")
-    }
-  }
-}
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala
deleted file mode 100644
index f6872aedfe69..000000000000
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/gan/GanExampleSuite.scala
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.gan
-
-import java.io.File
-
-import org.apache.mxnet.Context
-import org.apache.mxnetexamples.Util
-import org.scalatest.{BeforeAndAfterAll, FunSuite, Ignore}
-import org.slf4j.LoggerFactory
-
-import scala.language.postfixOps
-import scala.sys.process.Process
-
-class GanExampleSuite extends FunSuite with BeforeAndAfterAll{
-  private val logger = LoggerFactory.getLogger(classOf[GanExampleSuite])
-
-  test("Example CI: Test GAN MNIST") {
-      if (System.getenv().containsKey("SCALA_TEST_ON_GPU") &&
-        System.getenv("SCALA_TEST_ON_GPU").toInt == 1) {
-        logger.info("Downloading mnist model")
-        val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci"
-        val tempDirPath = System.getProperty("java.io.tmpdir")
-        val modelDirPath = tempDirPath + File.separator + "mnist/"
-        logger.info("tempDirPath: %s".format(tempDirPath))
-        Util.downloadUrl(baseUrl + "/mnist/mnist.zip", tempDirPath + "/mnist/mnist.zip")
-        // TODO: Need to confirm with Windows
-        Process("unzip " + tempDirPath + "/mnist/mnist.zip -d "
-          + tempDirPath + "/mnist/") !
-
-        val context = Context.gpu()
-
-        val output = GanMnist.runTraining(modelDirPath, context, modelDirPath, 3)
-
-        Process("rm -rf " + modelDirPath) !
-
-        assert(output >= 0.0f)
-      } else {
-        logger.info("GPU test only, skipped...")
-      }
-  }
-}
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/IMClassificationExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/IMClassificationExampleSuite.scala
deleted file mode 100644
index 1b5e362fec2b..000000000000
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/imclassification/IMClassificationExampleSuite.scala
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.imclassification
-
-import java.io.File
-
-import org.apache.mxnet.{Context, DType, ResourceScope}
-import org.apache.mxnetexamples.Util
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
-import org.slf4j.LoggerFactory
-
-import scala.language.postfixOps
-import scala.sys.process.Process
-
-/**
-  * Integration test for MNIST example.
-  */
-class IMClassificationExampleSuite extends FunSuite with BeforeAndAfterAll {
-  private val logger = LoggerFactory.getLogger(classOf[IMClassificationExampleSuite])
-
-  test("Example CI: Test MNIST Training") {
-
-    ResourceScope.using() {
-      logger.info("Downloading mnist model")
-      val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci"
-      val tempDirPath = System.getProperty("java.io.tmpdir")
-      val modelDirPath = tempDirPath + File.separator + "mnist/"
-      logger.info("tempDirPath: %s".format(tempDirPath))
-      Util.downloadUrl(baseUrl + "/mnist/mnist.zip",
-        tempDirPath + "/mnist/mnist.zip")
-      // TODO: Need to confirm with Windows
-      Process("unzip " + tempDirPath + "/mnist/mnist.zip -d "
-        + tempDirPath + "/mnist/") !
-
-      var context = Context.cpu()
-
-      val valAccuracy = TrainModel.test("mlp", modelDirPath)
-      Process("rm -rf " + modelDirPath) !
-
-      assert(valAccuracy >= 0.95f)
-    }
-  }
-
-  for(model <- List("mlp", "lenet", "resnet")) {
-    test(s"Example CI: Test Image Classification Model ${model}") {
-      ResourceScope.using() {
-        val valAccuracy = TrainModel.test(model, "", 10, 1, benchmark = true)
-      }
-    }
-  }
-
-  for(model <- List("mlp", "lenet", "resnet")) {
-    test(s"Example CI: Test Image Classification Model ${model} with Float64 input") {
-      ResourceScope.using() {
-        val valAccuracy = TrainModel.test(model, "", 10, 1, benchmark = true,
-          dtype = DType.Float64)
-      }
-    }
-  }
-
-}
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala
deleted file mode 100644
index d7233a277e59..000000000000
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/imageclassifier/ImageClassifierExampleSuite.scala
+++ /dev/null
@@ -1,78 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.infer.imageclassifier
-
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
-import org.slf4j.LoggerFactory
-import java.io.File
-import org.apache.mxnet.Context
-import org.apache.mxnet.ResourceScope
-import org.apache.mxnetexamples.Util
-
-import scala.language.postfixOps
-import sys.process.Process
-
-/**
-  * Integration test for imageClassifier example.
-  */
-class ImageClassifierExampleSuite extends FunSuite with BeforeAndAfterAll {
-  private val logger = LoggerFactory.getLogger(classOf[ImageClassifierExampleSuite])
-
-  test("testImageClassifierExample") {
-    logger.info("Downloading resnet-18 model")
-
-    val tempDirPath = System.getProperty("java.io.tmpdir")
-    logger.info("tempDirPath: %s".format(tempDirPath))
-
-    val baseUrl = "https://s3.us-east-2.amazonaws.com/scala-infer-models"
-
-    Util.downloadUrl(baseUrl + "/resnet-18/resnet-18-symbol.json",
-      tempDirPath + "/resnet18/resnet-18-symbol.json")
-    Util.downloadUrl(baseUrl + "/resnet-18/resnet-18-0000.params",
-      tempDirPath + "/resnet18/resnet-18-0000.params")
-    Util.downloadUrl(baseUrl + "/resnet-18/synset.txt",
-      tempDirPath + "/resnet18/synset.txt")
-    Util.downloadUrl("https://s3.amazonaws.com/model-server/inputs/Pug-Cookie.jpg",
-      tempDirPath + "/inputImages/resnet18/Pug-Cookie.jpg")
-
-    ResourceScope.using() {
-      val modelDirPath = tempDirPath + File.separator + "resnet18/"
-      val inputImagePath = tempDirPath + File.separator +
-        "inputImages/resnet18/Pug-Cookie.jpg"
-      val inputImageDir = tempDirPath + File.separator + "inputImages/resnet18/"
-
-      var context = Context.cpu()
-      if (System.getenv().containsKey("SCALA_TEST_ON_GPU") &&
-            System.getenv("SCALA_TEST_ON_GPU").toInt == 1) {
-        context = Context.gpu()
-      }
-
-      val output = ImageClassifierExample.runInferenceOnSingleImage(modelDirPath + "resnet-18",
-                                                                    inputImagePath, context)
-
-      val outputList = ImageClassifierExample.runInferenceOnBatchOfImage(modelDirPath + "resnet-18",
-                                                                         inputImageDir, context)
-
-      Process("rm -rf " + modelDirPath + " " + inputImageDir) !
-
-      assert(output(0).toList.head._1 === "n02110958 pug, pug-dog")
-      assert(outputList(0).toList.head._1 === "n02110958 pug, pug-dog")
-    }
-
-  }
-}
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/predictor/PredictorExampleSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/predictor/PredictorExampleSuite.scala
deleted file mode 100644
index f04f06a37cdd..000000000000
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/infer/predictor/PredictorExampleSuite.scala
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.infer.predictor
-
-import java.io.File
-
-import org.apache.mxnet._
-import org.apache.mxnetexamples.Util
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
-import org.slf4j.LoggerFactory
-
-class PredictorExampleSuite extends FunSuite with BeforeAndAfterAll {
-  private val logger = LoggerFactory.getLogger(classOf[PredictorExampleSuite])
-  private var modelDirPrefix = ""
-  private var inputImagePath = ""
-  private var context = Context.cpu()
-
-  override def beforeAll(): Unit = {
-    logger.info("Downloading resnet-18 model")
-
-    val tempDirPath = System.getProperty("java.io.tmpdir")
-    logger.info("tempDirPath: %s".format(tempDirPath))
-
-    val baseUrl = "https://s3.us-east-2.amazonaws.com/scala-infer-models"
-
-    Util.downloadUrl(baseUrl + "/resnet-18/resnet-18-symbol.json",
-      tempDirPath + "/resnet18/resnet-18-symbol.json")
-    Util.downloadUrl(baseUrl + "/resnet-18/resnet-18-0000.params",
-      tempDirPath + "/resnet18/resnet-18-0000.params")
-    Util.downloadUrl(baseUrl + "/resnet-18/synset.txt",
-      tempDirPath + "/resnet18/synset.txt")
-    Util.downloadUrl("https://s3.amazonaws.com/model-server/inputs/Pug-Cookie.jpg",
-      tempDirPath + "/inputImages/resnet18/Pug-Cookie.jpg")
-
-    modelDirPrefix = tempDirPath + File.separator + "resnet18/resnet-18"
-    inputImagePath = tempDirPath + File.separator +
-      "inputImages/resnet18/Pug-Cookie.jpg"
-
-    if (System.getenv().containsKey("SCALA_TEST_ON_GPU") &&
-      System.getenv("SCALA_TEST_ON_GPU").toInt == 1) {
-      context = Context.gpu()
-    }
-    val props = System.getProperties
-    props.setProperty("mxnet.disableShapeCheck", "true")
-  }
-
-  override def afterAll(): Unit = {
-    val props = System.getProperties
-    props.setProperty("mxnet.disableShapeCheck", "false")
-  }
-
-  test("test Predictor With Fixed Shape and random shape") {
-    ResourceScope.using() {
-      val inputDesc = IndexedSeq(new DataDesc("data", Shape(1, 3, 224, 224),
-                                              DType.Float32, Layout.NCHW))
-      val predictor = PredictorExample.loadModel(modelDirPrefix, inputDesc, context, 0)
-      // fix size
-      var img = PredictorExample.preProcess(inputImagePath, 224, 224)
-      var result = PredictorExample.doInference(predictor, img)(0)
-      var top1 = PredictorExample.postProcess(modelDirPrefix, result.toArray)
-      assert(top1 === "n02110958 pug, pug-dog")
-      // random size 512
-      img = PredictorExample.preProcess(inputImagePath, 512, 512)
-      result = PredictorExample.doInference(predictor, img)(0)
-      top1 = PredictorExample.postProcess(modelDirPrefix, result.toArray)
-      assert(top1 === "n02110958 pug, pug-dog")
-      // original size
-      img = PredictorExample.preProcess(inputImagePath, 1024, 576)
-      result = PredictorExample.doInference(predictor, img)(0)
-      top1 = PredictorExample.postProcess(modelDirPrefix, result.toArray)
-      assert(top1 === "n02110958 pug, pug-dog")
-    }
-  }
-}
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala
deleted file mode 100644
index 45361f79005d..000000000000
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/multitask/MultiTaskSuite.scala
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.multitask
-
-import org.apache.mxnet._
-import org.slf4j.LoggerFactory
-import org.apache.mxnet.Context
-
-import org.scalatest.FunSuite
-
-
-/**
-  * Integration test for Multi-task example.
-  */
-class MultiTaskSuite extends FunSuite {
-  test("Multitask Test") {
-    val logger = LoggerFactory.getLogger(classOf[MultiTaskSuite])
-    if (System.getenv().containsKey("SCALA_TEST_ON_GPU") &&
-      System.getenv("SCALA_TEST_ON_GPU").toInt == 1) {
-      logger.info("Multitask Test...")
-
-      ResourceScope.using() {
-        val batchSize = 100
-        val numEpoch = 3
-        val ctx = Context.gpu()
-
-        val modelPath = ExampleMultiTask.getTrainingData
-        val (executor, evalMetric) = ExampleMultiTask.train(batchSize, numEpoch, ctx, modelPath)
-        evalMetric.get.foreach { case (name, value) =>
-          assert(value >= 0.95f)
-        }
-        executor.dispose()
-      }
-    } else {
-      logger.info("GPU test only, skipped...")
-    }
-  }
-}
diff --git a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/rnn/ExampleRNNSuite.scala b/scala-package/examples/src/test/scala/org/apache/mxnetexamples/rnn/ExampleRNNSuite.scala
deleted file mode 100644
index 16982df8ea2e..000000000000
--- a/scala-package/examples/src/test/scala/org/apache/mxnetexamples/rnn/ExampleRNNSuite.scala
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnetexamples.rnn
-
-
-import org.apache.mxnet.{Context, ResourceScope}
-import org.apache.mxnetexamples.Util
-import org.scalatest.{BeforeAndAfterAll, FunSuite, Ignore}
-import org.slf4j.LoggerFactory
-
-import scala.language.postfixOps
-import scala.sys.process.Process
-
-class ExampleRNNSuite extends FunSuite with BeforeAndAfterAll {
-  private val logger = LoggerFactory.getLogger(classOf[ExampleRNNSuite])
-
-  override def beforeAll(): Unit = {
-    logger.info("Downloading LSTM model")
-    val tempDirPath = System.getProperty("java.io.tmpdir")
-    logger.info("tempDirPath: %s".format(tempDirPath))
-    val baseUrl = "https://s3.us-east-2.amazonaws.com/mxnet-scala/scala-example-ci/RNN/"
-    Util.downloadUrl(baseUrl + "obama.zip", tempDirPath + "/RNN/obama.zip")
-    Util.downloadUrl(baseUrl + "sherlockholmes.train.txt",
-      tempDirPath + "/RNN/sherlockholmes.train.txt")
-    Util.downloadUrl(baseUrl + "sherlockholmes.valid.txt",
-      tempDirPath + "/RNN/sherlockholmes.valid.txt")
-    // TODO: Need to confirm with Windows
-    Process(s"unzip $tempDirPath/RNN/obama.zip -d $tempDirPath/RNN/") !
-  }
-
-  test("Example CI: Test LSTM Bucketing") {
-    ResourceScope.using() {
-      val tempDirPath = System.getProperty("java.io.tmpdir")
-      var ctx = Context.cpu()
-      if (System.getenv().containsKey("SCALA_TEST_ON_GPU") &&
-        System.getenv("SCALA_TEST_ON_GPU").toInt == 1) {
-        ctx = Context.gpu()
-      }
-      if (!System.getenv().containsKey("CI")) {
-        LstmBucketing.runTraining(tempDirPath + "/RNN/sherlockholmes.train.txt",
-                                  tempDirPath + "/RNN/sherlockholmes.valid.txt", Array(ctx), 1)
-      } else {
-        logger.info("Skipping test on CI...")
-      }
-    }
-  }
-
-  test("Example CI: Test TrainCharRNN") {
-    ResourceScope.using() {
-      val tempDirPath = System.getProperty("java.io.tmpdir")
-      if (System.getenv().containsKey("SCALA_TEST_ON_GPU") &&
-            System.getenv("SCALA_TEST_ON_GPU").toInt == 1 &&
-            !System.getenv().containsKey("CI")) {
-        val ctx = Context.gpu()
-        TrainCharRnn.runTrainCharRnn(tempDirPath + "/RNN/obama.txt",
-          tempDirPath, ctx, 1)
-      } else {
-        logger.info("CPU not supported for this test, skipped...")
-      }
-    }
-  }
-
-  test("Example CI: Test Inference on CharRNN") {
-    ResourceScope.using() {
-      val tempDirPath = System.getProperty("java.io.tmpdir")
-      val ctx = Context.gpu()
-      TestCharRnn.runInferenceCharRNN(tempDirPath + "/RNN/obama.txt",
-        tempDirPath + "/RNN/obama", "The joke")
-    }
-  }
-}
diff --git a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/example/ClassificationExample.scala b/scala-package/spark/src/main/scala/org/apache/mxnet/spark/example/ClassificationExample.scala
deleted file mode 100644
index 2026bdee9fea..000000000000
--- a/scala-package/spark/src/main/scala/org/apache/mxnet/spark/example/ClassificationExample.scala
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnet.spark.example
-
-import org.apache.mxnet.spark.MXNet
-import org.apache.mxnet.{Symbol, NDArray, Context, Shape}
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.rdd.RDD
-import org.apache.spark.{SparkContext, SparkConf}
-import org.kohsuke.args4j.{Option, CmdLineParser}
-import org.slf4j.{LoggerFactory, Logger}
-
-import scala.collection.mutable.ArrayBuffer
-import scala.collection.JavaConverters._
-
-class ClassificationExample
-object ClassificationExample {
-  private val logger: Logger = LoggerFactory.getLogger(classOf[ClassificationExample])
-  def main(args: Array[String]): Unit = {
-    val cmdLine = new CommandLine
-    val parser: CmdLineParser = new CmdLineParser(cmdLine)
-    try {
-      parser.parseArgument(args.toList.asJava)
-      cmdLine.checkArguments()
-
-      val conf = new SparkConf().setAppName("MXNet")
-      val sc = new SparkContext(conf)
-
-      val network = if (cmdLine.model == "mlp") getMlp else getLenet
-      val dimension = if (cmdLine.model == "mlp") Shape(784) else Shape(1, 28, 28)
-      val devs =
-        if (cmdLine.gpus != null) cmdLine.gpus.split(',').map(id => Context.gpu(id.trim.toInt))
-        else if (cmdLine.cpus != null) cmdLine.cpus.split(',').map(id => Context.cpu(id.trim.toInt))
-        else Array(Context.cpu(0))
-
-      val mxnet = new MXNet()
-        .setBatchSize(128)
-        .setLabelName("softmax_label")
-        .setContext(devs)
-        .setDimension(dimension)
-        .setNetwork(network)
-        .setNumEpoch(cmdLine.numEpoch)
-        .setNumServer(cmdLine.numServer)
-        .setNumWorker(cmdLine.numWorker)
-        .setExecutorJars(cmdLine.jars)
-        .setJava(cmdLine.java)
-
-      val trainData = parseRawData(sc, cmdLine.input)
-      val start = System.currentTimeMillis
-      val model = mxnet.fit(trainData)
-      val timeCost = System.currentTimeMillis - start
-      logger.info("Training cost {} milli seconds", timeCost)
-      model.save(sc, cmdLine.output + "/model")
-
-      logger.info("Now do validation")
-      val valData = parseRawData(sc, cmdLine.inputVal)
-
-      val brModel = sc.broadcast(model)
-      val res = valData.mapPartitions { data =>
-        // get real labels
-        import org.apache.spark.mllib.linalg.Vector
-        val points = ArrayBuffer.empty[Vector]
-        val y = ArrayBuffer.empty[Float]
-        while (data.hasNext) {
-          val evalData = data.next()
-          y += evalData.label.toFloat
-          points += evalData.features
-        }
-
-        // get predicted labels
-        val probArrays = brModel.value.predict(points.toIterator)
-        require(probArrays.length == 1)
-        val prob = probArrays(0)
-        val py = NDArray.argmax_channel(prob.get)
-        require(y.length == py.size, s"${y.length} mismatch ${py.size}")
-
-        // I'm too lazy to calculate the accuracy
-        val res = Iterator((y.toArray zip py.toArray).map {
-          case (y1, py1) => y1 + "," + py1 }.mkString("\n"))
-
-        py.dispose()
-        prob.get.dispose()
-        res
-      }
-      res.saveAsTextFile(cmdLine.output + "/data")
-
-      sc.stop()
-    } catch {
-      case e: Throwable =>
-        e.printStackTrace()
-        logger.error(e.getMessage, e)
-        sys.exit(-1)
-    }
-  }
-
-  private def parseRawData(sc: SparkContext, path: String): RDD[LabeledPoint] = {
-    val raw = sc.textFile(path)
-    raw.map { s =>
-      val parts = s.split(' ')
-      val label = java.lang.Double.parseDouble(parts(0))
-      val features = Vectors.dense(parts(1).trim().split(',').map(java.lang.Double.parseDouble))
-      LabeledPoint(label, features)
-    }
-  }
-
-  private class CommandLine {
-    @Option(name = "--input", usage = "Input training file.")
-    val input: String = null
-    @Option(name = "--input-val", usage = "Input validation file.")
-    val inputVal: String = null
-    @Option(name = "--output", usage = "Output inferred result.")
-    val output: String = null
-    @Option(name = "--jars", usage = "Jars for running MXNet on other nodes.")
-    val jars: String = null
-    @Option(name = "--num-server", usage = "PS server number")
-    val numServer: Int = 1
-    @Option(name = "--num-worker", usage = "PS worker number")
-    val numWorker: Int = 1
-    @Option(name = "--num-epoch", usage = "Number of epochs")
-    val numEpoch: Int = 10
-    @Option(name = "--java", usage = "Java bin")
-    val java: String = "java"
-    @Option(name = "--model", usage = "Model definition")
-    val model: String = "mlp"
-    @Option(name = "--gpus", usage = "the gpus will be used, e.g. '0,1,2,3'")
-    val gpus: String = null
-    @Option(name = "--cpus", usage = "the cpus will be used, e.g. '0,1,2,3'")
-    val cpus: String = null
-
-    def checkArguments(): Unit = {
-      require(input != null, "Undefined input path")
-      require(numServer > 0, s"Invalid number of servers: $numServer")
-      require(numWorker > 0, s"Invalid number of workers: $numWorker")
-    }
-  }
-
-  def getMlp: Symbol = {
-    val data = Symbol.Variable("data")
-    val fc1 = Symbol.FullyConnected(name = "fc1")()(Map("data" -> data, "num_hidden" -> 128))
-    val act1 = Symbol.Activation(name = "relu1")()(Map("data" -> fc1, "act_type" -> "relu"))
-    val fc2 = Symbol.FullyConnected(name = "fc2")()(Map("data" -> act1, "num_hidden" -> 64))
-    val act2 = Symbol.Activation(name = "relu2")()(Map("data" -> fc2, "act_type" -> "relu"))
-    val fc3 = Symbol.FullyConnected(name = "fc3")()(Map("data" -> act2, "num_hidden" -> 10))
-    val mlp = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc3))
-    mlp
-  }
-
-  // LeCun, Yann, Leon Bottou, Yoshua Bengio, and Patrick
-  // Haffner. "Gradient-based learning applied to document recognition."
-  // Proceedings of the IEEE (1998)
-  def getLenet: Symbol = {
-    val data = Symbol.Variable("data")
-    // first conv
-    val conv1 = Symbol.Convolution()()(
-      Map("data" -> data, "kernel" -> "(5, 5)", "num_filter" -> 20))
-    val tanh1 = Symbol.Activation()()(Map("data" -> conv1, "act_type" -> "tanh"))
-    val pool1 = Symbol.Pooling()()(Map("data" -> tanh1, "pool_type" -> "max",
-      "kernel" -> "(2, 2)", "stride" -> "(2, 2)"))
-    // second conv
-    val conv2 = Symbol.Convolution()()(
-      Map("data" -> pool1, "kernel" -> "(5, 5)", "num_filter" -> 50))
-    val tanh2 = Symbol.Activation()()(Map("data" -> conv2, "act_type" -> "tanh"))
-    val pool2 = Symbol.Pooling()()(Map("data" -> tanh2, "pool_type" -> "max",
-      "kernel" -> "(2, 2)", "stride" -> "(2, 2)"))
-    // first fullc
-    val flatten = Symbol.Flatten()()(Map("data" -> pool2))
-    val fc1 = Symbol.FullyConnected()()(Map("data" -> flatten, "num_hidden" -> 500))
-    val tanh3 = Symbol.Activation()()(Map("data" -> fc1, "act_type" -> "tanh"))
-    // second fullc
-    val fc2 = Symbol.FullyConnected()()(Map("data" -> tanh3, "num_hidden" -> 10))
-    // loss
-    val lenet = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc2))
-    lenet
-  }
-}
diff --git a/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala b/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala
deleted file mode 100644
index 2382ca9fa358..000000000000
--- a/scala-package/spark/src/test/scala/org/apache/mxnet/spark/MXNetGeneralSuite.scala
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnet.spark
-
-import java.io.{BufferedReader, File, InputStreamReader}
-import java.nio.file.Files
-
-import scala.language.postfixOps
-import scala.sys.process.Process
-
-import org.apache.spark.SparkContext
-import org.apache.spark.mllib.linalg.Vectors
-import org.apache.spark.mllib.regression.LabeledPoint
-import org.apache.spark.rdd.RDD
-
-class MXNetGeneralSuite extends SharedSparkContext {
-
-  private var testDataDir: String = _
-
-  private def parseRawData(sc: SparkContext, path: String): RDD[LabeledPoint] = {
-    val raw = sc.textFile(path)
-    raw.map { s =>
-      val parts = s.split(' ')
-      val label = java.lang.Double.parseDouble(parts(0))
-      val features = Vectors.dense(parts(1).trim().split(',').map(java.lang.Double.parseDouble))
-      LabeledPoint(label, features)
-    }
-  }
-
-  private def downloadTestData(): Unit = {
-    Process("wget https://s3.us-east-2.amazonaws.com/mxnet-scala" +
-      "/scala-example-ci/Spark/train_full.txt" + " -P " + testDataDir + " -q") !
-  }
-
-//  override def beforeAll(): Unit = {
-//  val tempDirFile = Files.createTempDirectory(s"mxnet-spark-test-${System.currentTimeMillis()}").
-//      toFile
-//    testDataDir = tempDirFile.getPath
-//    tempDirFile.deleteOnExit()
-//    downloadTestData()
-//  }
-
-  test("Dummy test on Spark") {
-
-  }
-//  test("run spark with MLP") {
-//    val trainData = parseRawData(sc, s"$testDataDir/train_full.txt.txt")
-//    val model = buildMlp().fit(trainData)
-//    assert(model != null)
-//  }
-//
-//  test("run spark with LeNet") {
-//    val trainData = parseRawData(sc, s"$testDataDir/train_full.txt.txt")
-//    val model = buildLeNet().fit(trainData)
-//    assert(model != null)
-//  }
-}
diff --git a/scala-package/spark/src/test/scala/org/apache/mxnet/spark/SharedSparkContext.scala b/scala-package/spark/src/test/scala/org/apache/mxnet/spark/SharedSparkContext.scala
deleted file mode 100644
index 293cfa13cfce..000000000000
--- a/scala-package/spark/src/test/scala/org/apache/mxnet/spark/SharedSparkContext.scala
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.mxnet.spark
-
-import java.io.{File, FileFilter}
-
-import org.apache.mxnet.{Context, Shape, Symbol}
-
-import org.apache.spark.{SparkConf, SparkContext}
-import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite}
-
-trait SharedSparkContext extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll {
-
-  protected var sc: SparkContext = _
-
-  protected val numWorkers: Int = math.min(Runtime.getRuntime.availableProcessors(), 2)
-
-  override def beforeEach() {
-    sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("mxnet-spark-test"))
-  }
-
-  override def afterEach(): Unit = {
-    if (sc != null) {
-      sc.stop()
-    }
-  }
-
-  private def getMlp: Symbol = {
-    val data = Symbol.Variable("data")
-    val fc1 = Symbol.FullyConnected(name = "fc1")()(Map("data" -> data, "num_hidden" -> 128))
-    val act1 = Symbol.Activation(name = "relu1")()(Map("data" -> fc1, "act_type" -> "relu"))
-    val fc2 = Symbol.FullyConnected(name = "fc2")()(Map("data" -> act1, "num_hidden" -> 64))
-    val act2 = Symbol.Activation(name = "relu2")()(Map("data" -> fc2, "act_type" -> "relu"))
-    val fc3 = Symbol.FullyConnected(name = "fc3")()(Map("data" -> act2, "num_hidden" -> 10))
-    val mlp = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc3))
-    mlp
-  }
-
-  def getLenet: Symbol = {
-    val data = Symbol.Variable("data")
-    // first conv
-    val conv1 = Symbol.Convolution()()(
-      Map("data" -> data, "kernel" -> "(5, 5)", "num_filter" -> 20))
-    val tanh1 = Symbol.Activation()()(Map("data" -> conv1, "act_type" -> "tanh"))
-    val pool1 = Symbol.Pooling()()(Map("data" -> tanh1, "pool_type" -> "max",
-      "kernel" -> "(2, 2)", "stride" -> "(2, 2)"))
-    // second conv
-    val conv2 = Symbol.Convolution()()(
-      Map("data" -> pool1, "kernel" -> "(5, 5)", "num_filter" -> 50))
-    val tanh2 = Symbol.Activation()()(Map("data" -> conv2, "act_type" -> "tanh"))
-    val pool2 = Symbol.Pooling()()(Map("data" -> tanh2, "pool_type" -> "max",
-      "kernel" -> "(2, 2)", "stride" -> "(2, 2)"))
-    // first fullc
-    val flatten = Symbol.Flatten()()(Map("data" -> pool2))
-    val fc1 = Symbol.FullyConnected()()(Map("data" -> flatten, "num_hidden" -> 500))
-    val tanh3 = Symbol.Activation()()(Map("data" -> fc1, "act_type" -> "tanh"))
-    // second fullc
-    val fc2 = Symbol.FullyConnected()()(Map("data" -> tanh3, "num_hidden" -> 10))
-    // loss
-    val lenet = Symbol.SoftmaxOutput(name = "softmax")()(Map("data" -> fc2))
-    lenet
-  }
-
-  private def composeWorkingDirPath: String = {
-    System.getProperty("user.dir")
-  }
-
-  private def findJars(root: String): Array[File] = {
-    val excludedSuffixes = List("bundle", "src", "javadoc", "sources")
-    new File(root).listFiles(new FileFilter {
-      override def accept(pathname: File) = {
-        pathname.getAbsolutePath.endsWith(".jar") &&
-          excludedSuffixes.forall(!pathname.getAbsolutePath.contains(_))
-      }
-    })
-  }
-
-  private def getJarFilePath(root: String): String = {
-    val jarFiles = findJars(s"$root/target/")
-    Option(jarFiles).flatMap(_.headOption).map(_.getAbsolutePath).orNull
-  }
-
-  private def getSparkJar: String = {
-    val jarFiles = findJars(s"$composeWorkingDirPath/target/")
-    Option(jarFiles).flatMap(_.headOption).map(_.getAbsolutePath).orNull
-  }
-
-  private def getNativeJars(root: String): String =
-    new File(root).listFiles().map(_.toPath).mkString(",")
-
-  protected def buildLeNet(): MXNet = {
-    val workingDir = composeWorkingDirPath
-    val assemblyRoot = s"$workingDir/../assembly"
-    new MXNet()
-      .setBatchSize(128)
-      .setLabelName("softmax_label")
-      .setContext(Array(Context.cpu(0), Context.cpu(1)))
-      .setDimension(Shape(1, 28, 28))
-      .setNetwork(getLenet)
-      .setNumEpoch(10)
-      .setNumServer(1)
-      .setNumWorker(numWorkers)
-      .setExecutorJars(s"${getJarFilePath(assemblyRoot)},$getSparkJar")
-      .setJava("java")
-  }
-
-  protected def buildMlp(): MXNet = {
-    val workingDir = composeWorkingDirPath
-    val assemblyRoot = s"$workingDir/../assembly"
-    val nativeRoot = s"$workingDir/../native/target/lib"
-
-    new MXNet()
-      .setBatchSize(128)
-      .setLabelName("softmax_label")
-      .setContext(Array(Context.cpu(0), Context.cpu(1)))
-      .setDimension(Shape(784))
-      .setNetwork(getMlp)
-      .setNumEpoch(10)
-      .setNumServer(1)
-      .setNumWorker(numWorkers)
-      .setExecutorJars(s"${getJarFilePath(assemblyRoot)},$getSparkJar,${getNativeJars(nativeRoot)}")
-      .setJava("java")
-      .setTimeout(0)
-  }
-}
diff --git a/src/executor/graph_executor.cc b/src/executor/graph_executor.cc
index a213ff040074..6519ab21acab 100644
--- a/src/executor/graph_executor.cc
+++ b/src/executor/graph_executor.cc
@@ -362,7 +362,6 @@ nnvm::Graph GraphExecutor::InitFullGraph(nnvm::Symbol symbol,
         if (type == "Convolution") return false;
         if (type == "FullyConnected") return false;
         if (type == "Concat") return false;
-        if (type == "SoftmaxOutput") return false;
         return true;
       };
 
diff --git a/src/operator/nn/mkldnn/mkldnn_base-inl.h b/src/operator/nn/mkldnn/mkldnn_base-inl.h
index 3e73103b2f14..64aa7f894134 100644
--- a/src/operator/nn/mkldnn/mkldnn_base-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_base-inl.h
@@ -200,7 +200,6 @@ struct LeakyReLUParam;
 struct ConvolutionParam;
 struct DeconvolutionParam;
 struct SoftmaxParam;
-struct SoftmaxOutputParam;
 struct TransposeParam;
 struct ReshapeParam;
 bool SupportMKLDNNAct(const ActivationParam& param);
@@ -213,7 +212,6 @@ bool SupportMKLDNNDeconv(const DeconvolutionParam& params, const NDArray &input)
 bool SupportMKLDNNSoftmax(const SoftmaxParam& param, const NDArray &input, const NDArray &output);
 bool SupportMKLDNNLogSoftmax(const SoftmaxParam& param, const NDArray &input,
                              const NDArray &output);
-bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam &param);
 bool SupportMKLDNNTranspose(const TransposeParam& param, const NDArray &data);
 }  // namespace op
 
diff --git a/src/operator/nn/mkldnn/mkldnn_ops-inl.h b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
index 32f2e9f74130..15c2040da85a 100644
--- a/src/operator/nn/mkldnn/mkldnn_ops-inl.h
+++ b/src/operator/nn/mkldnn/mkldnn_ops-inl.h
@@ -107,11 +107,6 @@ void MKLDNNLogSoftmaxBackward(const nnvm::NodeAttrs& attrs, const OpContext &ctx
                               const std::vector<OpReqType> &req,
                               const std::vector<NDArray> &out_data);
 
-/* For softmax_output */
-void MKLDNNSoftmaxOutputForward(const nnvm::NodeAttrs& attrs, const OpContext &ctx,
-                                const std::vector<NDArray> &in_data,
-                                const std::vector<OpReqType> &req,
-                                const std::vector<NDArray> &out_data);
 
 /* For sum */
 void MKLDNNSumForward(const nnvm::NodeAttrs &attrs, const OpContext &ctx,
diff --git a/src/operator/nn/mkldnn/mkldnn_softmax_output.cc b/src/operator/nn/mkldnn/mkldnn_softmax_output.cc
deleted file mode 100644
index dbd3abf2276d..000000000000
--- a/src/operator/nn/mkldnn/mkldnn_softmax_output.cc
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file mkldnn_softmax_output.cc
- * \brief integrate mkldnn softmax to softmax_output forward
- * \author Zhang Rong A
-*/
-
-#if MXNET_USE_MKLDNN == 1
-#include "../../softmax_output-inl.h"
-#include "./mkldnn_ops-inl.h"
-#include "./mkldnn_base-inl.h"
-namespace mxnet {
-namespace op {
-
-static mkldnn::softmax_forward::primitive_desc GetSoftmaxOutputFwdDescImpl(
-               const SoftmaxOutputParam& param, bool is_train,
-               const int axis, const mkldnn::memory &input_mem) {
-  mkldnn::memory::desc data_md = input_mem.get_desc();
-  auto cpu_engine = CpuEngine::Get()->get_engine();
-  auto prop = is_train ? mkldnn::prop_kind::forward_training
-                       : mkldnn::prop_kind::forward_scoring;
-  auto desc = mkldnn::softmax_forward::desc(prop, data_md, axis);
-  return mkldnn::softmax_forward::primitive_desc(desc, cpu_engine);
-}
-
-typedef ParamOpSign<SoftmaxOutputParam> MKLDNNSoftmaxOuputSignature;
-
-class MKLDNNSoftmaxOutputFwd {
-  std::shared_ptr<mkldnn::softmax_forward> fwd_;
-
- public:
-  const mkldnn::softmax_forward::primitive_desc fwd_pd;
-
-  MKLDNNSoftmaxOutputFwd(const SoftmaxOutputParam& param, bool is_train,
-                         const int axis, const mkldnn::memory &mem): fwd_pd(
-                         GetSoftmaxOutputFwdDescImpl(param, is_train, axis, mem)) {
-    fwd_ = std::make_shared<mkldnn::softmax_forward>(fwd_pd);
-  }
-
-  const inline mkldnn::softmax_forward &GetFwd() const {
-    return *fwd_;
-  }
-};
-
-static MKLDNNSoftmaxOutputFwd &GetSoftmaxOutputForward(const SoftmaxOutputParam& param,
-                                                       const OpContext &ctx,
-                                                       const NDArray &in_data) {
-#if DMLC_CXX11_THREAD_LOCAL
-  static thread_local
-    std::unordered_map<MKLDNNSoftmaxOuputSignature, MKLDNNSoftmaxOutputFwd, OpHash> fwds;
-#else
-  static MX_THREAD_LOCAL
-    std::unordered_map<MKLDNNSoftmaxOuputSignature, MKLDNNSoftmaxOutputFwd, OpHash> fwds;
-#endif
-  MKLDNNSoftmaxOuputSignature key(param);
-  key.AddSign(ctx.is_train);
-  key.AddSign(in_data);
-
-  //  softmax_output has no axis parameter, so use it as it original implement.
-  int axis = in_data.shape().ndim() - 1;
-
-  auto it = fwds.find(key);
-  if (it == fwds.end()) {
-    auto in_mem = *(in_data.GetMKLDNNData());
-    MKLDNNSoftmaxOutputFwd fwd(param, ctx.is_train, axis, in_mem);
-    it = AddToCache(&fwds, key, fwd);
-  }
-  return it->second;
-}
-
-//  This is only used for forward. For backward ,need double check compatibility
-bool SupportMKLDNNSoftmaxOutput(const SoftmaxOutputParam &param) {
-  return param.multi_output ? false : true;
-}
-
-void MKLDNNSoftmaxOutputForward(const nnvm::NodeAttrs& attrs,
-                                const OpContext &ctx,
-                                const std::vector<NDArray> &in_data,
-                                const std::vector<OpReqType> &req,
-                                const std::vector<NDArray> &out_data) {
-  const SoftmaxOutputParam &param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
-
-  NDArray idata = in_data[softmaxout_enum::kData];
-  NDArray odata = out_data[softmaxout_enum::kOut];
-  if (in_data[softmaxout_enum::kData].IsView() && in_data[softmaxout_enum::kData].IsMKLDNNData()) {
-    idata = in_data[softmaxout_enum::kData].Reorder2Default();
-  }
-
-  auto input_mem = idata.GetMKLDNNData();
-  auto out_mem = CreateMKLDNNMem(out_data[softmaxout_enum::kOut],
-                                 input_mem->get_desc(), req[softmaxout_enum::kOut]);
-
-  MKLDNNSoftmaxOutputFwd &fwd = GetSoftmaxOutputForward(param, ctx, idata);
-
-  MKLDNNStream *stream = MKLDNNStream::Get();
-  stream->RegisterPrimArgs(fwd.GetFwd(),
-                           {{MKLDNN_ARG_SRC, *input_mem}, {MKLDNN_ARG_DST, *out_mem.second}});
-  CommitOutput(out_data[softmaxout_enum::kOut], out_mem);
-  stream->Submit();
-}
-}   // namespace op
-}   // namespace mxnet
-#endif
-
diff --git a/src/operator/regression_output.cc b/src/operator/regression_output.cc
deleted file mode 100644
index a337ec1ca1ad..000000000000
--- a/src/operator/regression_output.cc
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file regression_ouput.cc
- * \brief Regression output operator.
-*/
-
-#include "./regression_output-inl.h"
-#include "./elemwise_op_common.h"
-
-
-#define MXNET_OPERATOR_REGISTER_REGRESSION_FWD(__name$, __kernel$, __bwdop$)           \
-  NNVM_REGISTER_OP(__name$)                                                            \
-  MXNET_ADD_SPARSE_OP_ALIAS(__name$)                                                   \
-  .set_num_inputs(2)                                                                   \
-  .set_num_outputs(1)                                                                  \
-  .set_attr<nnvm::FListInputNames>("FListInputNames",                                  \
-    [](const NodeAttrs& attrs) {                                                       \
-      return std::vector<std::string>{"data", "label"};                                \
-    })                                                                                 \
-  .set_attr<mxnet::FInferShape>("FInferShape", RegressionOpShape)                       \
-  .set_attr<nnvm::FGradient>("FGradient", RegressionOpGrad{__bwdop$})                  \
-  .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<2, 1>)                        \
-  .set_attr<nnvm::FInplaceOption>("FInplaceOption",                                    \
-  [](const NodeAttrs& attrs){                                                          \
-    return std::vector<std::pair<int, int> >{{0, 0}};                                  \
-  })                                                                                   \
-  .set_attr<FCompute>("FCompute<cpu>", RegressionForward<cpu, __kernel$>)              \
-  .add_argument("data", "NDArray-or-Symbol", "Input data to the function.")            \
-  .add_argument("label", "NDArray-or-Symbol", "Input label to the function.")          \
-  .add_arguments(RegressionOutputParam::__FIELDS__())
-
-#define MXNET_OPERATOR_REGISTER_REGRESSION_BWD(__name$, __kernel$)                      \
-  NNVM_REGISTER_OP(__name$)                                                             \
-  .set_num_inputs(2)                                                                    \
-  .set_num_outputs(2)                                                                   \
-  .set_attr_parser(ParamParser<RegressionOutputParam>)                                  \
-  .set_attr<nnvm::TIsBackward>("TIsBackward", true)                                     \
-  .set_attr<nnvm::FInferType>("FInferType", ElemwiseType<2, 2>)                         \
-  .set_attr<nnvm::FInplaceOption>("FInplaceOption",                                     \
-  [](const NodeAttrs& attrs){                                                           \
-    return std::vector<std::pair<int, int> >{{1, 0}};                                   \
-  })                                                                                    \
-  .set_attr<FCompute>("FCompute<cpu>", RegressionBackward<cpu, __kernel$>)
-
-namespace mxnet {
-namespace op {
-
-
-DMLC_REGISTER_PARAMETER(RegressionOutputParam);
-
-MXNET_OPERATOR_REGISTER_REGRESSION_FWD(LinearRegressionOutput,
-  mshadow_op::identity, "_backward_linear_reg_out")
-.set_attr<FInferStorageType>("FInferStorageType", RegressionInferStorageType<true>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", RegressionForwardEx<cpu, mshadow_op::identity>)
-.describe(R"code(Computes and optimizes for squared loss during backward propagation.
-Just outputs ``data`` during forward propagation.
-
-If :math:`\hat{y}_i` is the predicted value of the i-th sample, and :math:`y_i` is the corresponding target value,
-then the squared loss estimated over :math:`n` samples is defined as
-
-:math:`\text{SquaredLoss}(\textbf{Y}, \hat{\textbf{Y}} ) = \frac{1}{n} \sum_{i=0}^{n-1} \lVert  \textbf{y}_i - \hat{\textbf{y}}_i  \rVert_2`
-
-.. note::
-   Use the LinearRegressionOutput as the final output layer of a net.
-
-The storage type of ``label`` can be ``default`` or ``csr``
-
-- LinearRegressionOutput(default, default) = default
-- LinearRegressionOutput(default, csr) = default
-
-By default, gradients of this loss function are scaled by factor `1/m`, where m is the number of regression outputs of a training example.
-The parameter `grad_scale` can be used to change this scale to `grad_scale/m`.
-
-)code" ADD_FILELINE);
-
-MXNET_OPERATOR_REGISTER_REGRESSION_BWD(_backward_linear_reg_out, mshadow_op::minus)
-.set_attr<FInferStorageType>("FInferStorageType", RegressionInferStorageType<false>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", RegressionBackwardEx<cpu, mshadow_op::minus>);
-
-MXNET_OPERATOR_REGISTER_REGRESSION_FWD(MAERegressionOutput,
-  mshadow_op::identity, "_backward_mae_reg_out")
-.describe(R"code(Computes mean absolute error of the input.
-
-MAE is a risk metric corresponding to the expected value of the absolute error.
-
-If :math:`\hat{y}_i` is the predicted value of the i-th sample, and :math:`y_i` is the corresponding target value,
-then the mean absolute error (MAE) estimated over :math:`n` samples is defined as
-
-:math:`\text{MAE}(\textbf{Y}, \hat{\textbf{Y}} ) = \frac{1}{n} \sum_{i=0}^{n-1} \lVert \textbf{y}_i - \hat{\textbf{y}}_i \rVert_1`
-
-.. note::
-   Use the MAERegressionOutput as the final output layer of a net.
-
-The storage type of ``label`` can be ``default`` or ``csr``
-
-- MAERegressionOutput(default, default) = default
-- MAERegressionOutput(default, csr) = default
-
-By default, gradients of this loss function are scaled by factor `1/m`, where m is the number of regression outputs of a training example.
-The parameter `grad_scale` can be used to change this scale to `grad_scale/m`.
-
-)code" ADD_FILELINE);
-
-MXNET_OPERATOR_REGISTER_REGRESSION_BWD(_backward_mae_reg_out, mshadow_op::minus_sign);
-
-MXNET_OPERATOR_REGISTER_REGRESSION_FWD(LogisticRegressionOutput,
-  mshadow_op::sigmoid, "_backward_logistic_reg_out")
-.set_attr<FInferStorageType>("FInferStorageType", RegressionInferStorageType<true>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", RegressionForwardEx<cpu, mshadow_op::sigmoid>)
-.describe(R"code(Applies a logistic function to the input.
-
-The logistic function, also known as the sigmoid function, is computed as
-:math:`\frac{1}{1+exp(-\textbf{x})}`.
-
-Commonly, the sigmoid is used to squash the real-valued output of a linear model
-:math:`wTx+b` into the [0,1] range so that it can be interpreted as a probability.
-It is suitable for binary classification or probability prediction tasks.
-
-.. note::
-   Use the LogisticRegressionOutput as the final output layer of a net.
-
-The storage type of ``label`` can be ``default`` or ``csr``
-
-- LogisticRegressionOutput(default, default) = default
-- LogisticRegressionOutput(default, csr) = default
-
-The loss function used is the Binary Cross Entropy Loss:
-
-:math:`-{(y\log(p) + (1 - y)\log(1 - p))}`
-
-Where `y` is the ground truth probability of positive outcome for a given example, and `p` the probability predicted by the model. By default, gradients of this loss function are scaled by factor `1/m`, where m is the number of regression outputs of a training example.
-The parameter `grad_scale` can be used to change this scale to `grad_scale/m`.
-
-)code" ADD_FILELINE);
-
-MXNET_OPERATOR_REGISTER_REGRESSION_BWD(_backward_logistic_reg_out, mshadow_op::minus)
-.set_attr<FInferStorageType>("FInferStorageType", RegressionInferStorageType<false>)
-.set_attr<FComputeEx>("FComputeEx<cpu>", RegressionBackwardEx<cpu, mshadow_op::minus>);
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/regression_output.cu b/src/operator/regression_output.cu
deleted file mode 100644
index ca11b84a212d..000000000000
--- a/src/operator/regression_output.cu
+++ /dev/null
@@ -1,53 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * \file regression_ouput.cu
- * \brief Regression output operator.
-*/
-#include "./regression_output-inl.h"
-
-
-namespace mxnet {
-namespace op {
-
-NNVM_REGISTER_OP(LinearRegressionOutput)
-.set_attr<FCompute>("FCompute<gpu>", RegressionForward<gpu, mshadow_op::identity>)
-.set_attr<FComputeEx>("FComputeEx<gpu>", RegressionForwardEx<gpu, mshadow_op::identity>);
-
-NNVM_REGISTER_OP(_backward_linear_reg_out)
-.set_attr<FCompute>("FCompute<gpu>", RegressionBackward<gpu, mshadow_op::minus>)
-.set_attr<FComputeEx>("FComputeEx<gpu>", RegressionBackwardEx<gpu, mshadow_op::minus>);
-
-NNVM_REGISTER_OP(MAERegressionOutput)
-.set_attr<FCompute>("FCompute<gpu>", RegressionForward<gpu, mshadow_op::identity>);
-
-NNVM_REGISTER_OP(_backward_mae_reg_out)
-.set_attr<FCompute>("FCompute<gpu>", RegressionBackward<gpu, mshadow_op::minus_sign>);
-
-NNVM_REGISTER_OP(LogisticRegressionOutput)
-.set_attr<FCompute>("FCompute<gpu>", RegressionForward<gpu, mshadow_op::sigmoid>)
-.set_attr<FComputeEx>("FComputeEx<gpu>", RegressionForwardEx<gpu, mshadow_op::sigmoid>);
-
-NNVM_REGISTER_OP(_backward_logistic_reg_out)
-.set_attr<FCompute>("FCompute<gpu>", RegressionBackward<gpu, mshadow_op::minus>)
-.set_attr<FComputeEx>("FComputeEx<gpu>", RegressionBackwardEx<gpu, mshadow_op::minus>);
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/softmax_output-inl.h b/src/operator/softmax_output-inl.h
deleted file mode 100644
index 22a1e5ff011c..000000000000
--- a/src/operator/softmax_output-inl.h
+++ /dev/null
@@ -1,476 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file softmax_output-inl.h
- * \brief
- * \author Bing Xu
-*/
-#ifndef MXNET_OPERATOR_SOFTMAX_OUTPUT_INL_H_
-#define MXNET_OPERATOR_SOFTMAX_OUTPUT_INL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <cstring>
-#include <map>
-#include <string>
-#include <vector>
-#include <utility>
-#include "./operator_common.h"
-
-namespace mxnet {
-namespace op {
-
-namespace softmaxout_enum {
-enum SoftmaxOutputOpInputs {kData, kLabel};
-enum SoftmaxOutputOpOutputs {kOut};
-enum SoftmaxOutputNormType {kNull, kBatch, kValid};
-enum SoftmaxOutputOpResource {kTempSpace};
-}  // namespace softmaxout_enum
-
-struct SoftmaxOutputParam : public dmlc::Parameter<SoftmaxOutputParam> {
-  float grad_scale;
-  float ignore_label;
-  bool multi_output;
-  bool use_ignore;
-  bool preserve_shape;
-  int normalization;
-  bool out_grad;
-  float smooth_alpha;
-  DMLC_DECLARE_PARAMETER(SoftmaxOutputParam) {
-    DMLC_DECLARE_FIELD(grad_scale).set_default(1.0f)
-    .describe("Scales the gradient by a float factor.");
-    DMLC_DECLARE_FIELD(ignore_label).set_default(-1.0f)
-    .describe("The instances whose `labels` == `ignore_label` will be ignored "
-              "during backward, if `use_ignore` is set to ``true``).");
-    DMLC_DECLARE_FIELD(multi_output).set_default(false)
-    .describe("If set to ``true``, the softmax function will be computed along "
-              "axis ``1``. This is applied when the shape "
-              "of input array differs from the shape of label array.");
-    DMLC_DECLARE_FIELD(use_ignore).set_default(false)
-    .describe("If set to ``true``, the `ignore_label` value will not contribute "
-              "to the backward gradient.");
-    DMLC_DECLARE_FIELD(preserve_shape).set_default(false)
-    .describe("If set to ``true``, the softmax function will be computed along "
-              "the last axis (``-1``).");
-    DMLC_DECLARE_FIELD(normalization)
-    .add_enum("null", softmaxout_enum::kNull)
-    .add_enum("batch", softmaxout_enum::kBatch)
-    .add_enum("valid", softmaxout_enum::kValid)
-    .set_default(softmaxout_enum::kNull)
-    .describe("Normalizes the gradient.");
-    DMLC_DECLARE_FIELD(out_grad)
-    .set_default(false)
-    .describe("Multiplies gradient with output gradient element-wise.");
-    DMLC_DECLARE_FIELD(smooth_alpha)
-    .set_default(0.0f)
-    .set_range(0.0f, 1.0f)
-    .describe("Constant for computing a label smoothed version of cross-entropy"
-              "for the backwards pass.  This constant gets subtracted from the"
-              "one-hot encoding of the gold label and distributed uniformly to"
-              "all other labels.");
-  };
-
-  bool operator==(const SoftmaxOutputParam& other) const {
-    return this->grad_scale == other.grad_scale &&
-           this->ignore_label == other.ignore_label &&
-           this->multi_output == other.multi_output &&
-           this->use_ignore == other.use_ignore &&
-           this->preserve_shape == other.preserve_shape &&
-           this->normalization == other.normalization &&
-           this->out_grad == other.out_grad &&
-           this->smooth_alpha == other.smooth_alpha;
-  }
-};
-
-template<typename xpu, typename DType>
-class SoftmaxOutputOp : public Operator {
- public:
-  explicit SoftmaxOutputOp(SoftmaxOutputParam param) : param_(param) {}
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 2U) << "SoftmaxOutput Input: [data, label]";
-    CHECK_EQ(out_data.size(), 1U) << "SoftmaxOutput Output: [output]";
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    if (param_.multi_output) {
-      index_t n = in_data[softmaxout_enum::kData].size(0);
-      index_t k = in_data[softmaxout_enum::kData].size(1);
-      Shape<3> s3 = Shape3(n, k, static_cast<index_t>(in_data[softmaxout_enum::kData].Size()/n/k));
-      Tensor<xpu, 3, DType> data =
-          in_data[softmaxout_enum::kData].get_with_shape<xpu, 3, DType>(s3, s);
-      Tensor<xpu, 3, DType> out =
-          out_data[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
-      Softmax(out, data);
-    } else {
-      if (param_.preserve_shape) {
-        Tensor<xpu, 2, DType> data = in_data[softmaxout_enum::kData].FlatTo2D<xpu, DType>(s);
-        Tensor<xpu, 2, DType> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
-        Softmax(out, data);
-      } else {
-        index_t n = in_data[softmaxout_enum::kData].size(0);
-        index_t k = in_data[softmaxout_enum::kData].Size()/n;
-        Shape<2> s2 = Shape2(n, k);
-        Tensor<xpu, 2, DType> data =
-            in_data[softmaxout_enum::kData].get_with_shape<xpu, 2, DType>(s2, s);
-        Tensor<xpu, 2, DType> out =
-            out_data[softmaxout_enum::kOut].get_with_shape<xpu, 2, DType>(s2, s);
-        Softmax(out, data);
-      }
-    }
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 2U);
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_GE(in_grad.size(), 1U);
-    CHECK_GE(req.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-
-    if (out_data[softmaxout_enum::kOut].shape_ ==
-        in_data[softmaxout_enum::kLabel].shape_) {
-      // use probability as label
-      Tensor<xpu, 2, DType> label = in_data[softmaxout_enum::kLabel].FlatTo2D<xpu, DType>(s);
-      Tensor<xpu, 2, DType> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
-      Tensor<xpu, 2, DType> grad = in_grad[softmaxout_enum::kData].FlatTo2D<xpu, DType>(s);
-      if (param_.out_grad) {
-        Tensor<xpu, 2, DType> ograd = out_grad[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
-        grad = scalar<DType>(param_.grad_scale) * (out - label) * ograd;
-      } else {
-        grad = (out - label) * scalar<DType>(param_.grad_scale);
-      }
-    } else if (param_.multi_output) {
-      index_t n = out_data[softmaxout_enum::kOut].size(0);
-      index_t k = out_data[softmaxout_enum::kOut].size(1);
-      Shape<3> s3 = Shape3(n, k, static_cast<index_t>(out_data[softmaxout_enum::kOut].Size()/n/k));
-      Shape<2> s2 = Shape2(s3[0], s3[2]);
-      Tensor<xpu, 2, DType> label =
-          in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 2, DType>(s2, s);
-      Tensor<xpu, 3, DType> out =
-          out_data[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
-      Tensor<xpu, 3, DType> grad =
-          in_grad[softmaxout_enum::kData].get_with_shape<xpu, 3, DType>(s3, s);
-
-      index_t valid_cnt = label.shape_.Size();
-      if (param_.use_ignore) {
-          SoftmaxGrad(grad, out, label, static_cast<DType>(param_.ignore_label));
-      } else {
-          SoftmaxGrad(grad, out, label);
-      }
-      if (param_.normalization == softmaxout_enum::kBatch) {
-        valid_cnt = label.size(0);
-      } else if (param_.normalization == softmaxout_enum::kValid) {
-        int i_label = static_cast<int>(param_.ignore_label);
-        Tensor<cpu, 2, DType> workspace =
-          ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<2, DType>(
-          label.shape_);
-        Copy(workspace, label, label.stream_);
-        for (index_t i = 0; i < workspace.size(0); ++i) {
-          for (index_t j = 0; j < workspace.size(1); ++j) {
-            if (static_cast<int>(workspace[i][j]) == i_label) {
-              valid_cnt--;
-            }
-          }
-        }
-        valid_cnt = valid_cnt == 0 ? 1 : valid_cnt;
-      } else {
-        valid_cnt = 1;
-      }
-      grad *= DType(param_.grad_scale /
-                    (param_.normalization == softmaxout_enum::kValid ? 1 : s3[2]) /
-                    valid_cnt);
-      if (param_.out_grad) {
-        Tensor<xpu, 3, DType> ograd =
-          out_grad[softmaxout_enum::kOut].get_with_shape<xpu, 3, DType>(s3, s);
-        grad *= ograd;
-      }
-    } else {
-      Shape<1> label_shape = Shape1(in_data[softmaxout_enum::kLabel].Size());
-      Shape<2> data_shape;
-      if (param_.preserve_shape) {
-        data_shape = out_data[softmaxout_enum::kOut].shape_.FlatTo2D();
-//        Tensor<xpu, 1, DType> label = in_data[softmaxout_enum::kLabel].FlatTo1D<xpu, DType>(s);
-//        Tensor<xpu, 2, DType> out = out_data[softmaxout_enum::kOut].FlatTo2D<xpu, DType>(s);
-//        Tensor<xpu, 2, DType> grad = in_grad[softmaxout_enum::kData].FlatTo2D<xpu, DType>(s);
-      } else {
-        index_t n = out_data[softmaxout_enum::kOut].size(0);
-        data_shape = Shape2(n, out_data[softmaxout_enum::kOut].Size()/n);
-      }
-      Tensor<xpu, 1, DType> label = in_data[softmaxout_enum::kLabel].get_with_shape<xpu, 1, DType>(
-          label_shape, s);
-      Tensor<xpu, 2, DType> out =
-          out_data[softmaxout_enum::kOut].get_with_shape<xpu, 2, DType>(data_shape, s);
-      Tensor<xpu, 2, DType> grad =
-          in_grad[softmaxout_enum::kData].get_with_shape<xpu, 2, DType>(data_shape, s);
-      index_t valid_cnt = label.shape_.Size();
-      if (param_.use_ignore) {
-        if (param_.smooth_alpha == 0.0f) {
-          SoftmaxGrad(grad, out, label, static_cast<DType>(param_.ignore_label));
-        } else {
-          SmoothSoftmaxGrad(grad, out, label, static_cast<DType>(param_.ignore_label),
-                            param_.smooth_alpha);
-        }
-      } else {
-        if (param_.smooth_alpha == 0.0f) {
-          SoftmaxGrad(grad, out, label);
-        } else {
-          SmoothSoftmaxGrad(grad, out, label, param_.smooth_alpha);
-        }
-      }
-      if (param_.normalization == softmaxout_enum::kBatch) {
-        valid_cnt = label.size(0);
-      } else if (param_.normalization == softmaxout_enum::kValid) {
-        int i_label = static_cast<int>(param_.ignore_label);
-        Tensor<cpu, 1, DType> workspace =
-          ctx.requested[softmaxout_enum::kTempSpace].get_host_space_typed<1, DType>(
-          label.shape_);
-        Copy(workspace, label, label.stream_);
-        for (index_t i = 0; i < label.size(0); ++i) {
-          if (static_cast<int>(workspace[i]) == i_label) {
-            valid_cnt--;
-          }
-        }
-        valid_cnt = valid_cnt == 0 ? 1 : valid_cnt;
-      } else {
-        valid_cnt = 1;
-      }
-      grad *= DType(param_.grad_scale / valid_cnt);
-      if (param_.out_grad) {
-        Tensor<xpu, 2, DType> ograd =
-          out_grad[softmaxout_enum::kOut].get_with_shape<xpu, 2, DType>(data_shape, s);
-        grad *= ograd;
-      }
-    }
-  }
-
- private:
-  SoftmaxOutputParam param_;
-};  // class SoftmaxOutputOp
-
-template<typename xpu>
-void SoftmaxOutputCompute(const nnvm::NodeAttrs& attrs,
-                          const OpContext& ctx, const std::vector<TBlob>& inputs,
-                          const std::vector<OpReqType>& req,
-                          const std::vector<TBlob>& outputs) {
-  const SoftmaxOutputParam &param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
-  const std::vector<TBlob> no_use_but_adapt_origin_api;
-  CHECK_EQ(inputs.size(), 2U);
-
-  MSHADOW_REAL_TYPE_SWITCH(inputs[softmaxout_enum::kData].type_flag_, DType, {
-    SoftmaxOutputOp<xpu, DType> op(param);
-    op.Forward(ctx, inputs, req, outputs, no_use_but_adapt_origin_api);
-  });
-}
-
-template<typename xpu>
-void SoftmaxOutputGradCompute(const nnvm::NodeAttrs& attrs,
-                              const OpContext& ctx,
-                              const std::vector<TBlob>& inputs,
-                              const std::vector<OpReqType>& req,
-                              const std::vector<TBlob>& outputs) {
-  const SoftmaxOutputParam& param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
-  const std::vector<TBlob> no_use_but_adapt_origin_api;
-  CHECK_EQ(inputs.size(), 2U);
-
-  std::vector<TBlob> out_grad{inputs[0]};
-  std::vector<TBlob> out_data{inputs[0]};
-  std::vector<TBlob> in_data(inputs.begin(), inputs.end());
-  int dtype = inputs[0].type_flag_;
-  const std::vector<TBlob> &in_grad = outputs;
-
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    SoftmaxOutputOp<xpu, DType> op(param);
-    op.Backward(ctx, out_grad, in_data, out_data, req, in_grad, no_use_but_adapt_origin_api);
-  });
-}
-
-
-#if DMLC_USE_CXX11
-class SoftmaxOutputProp : public OperatorProperty {
- public:
-  std::vector<std::string> ListArguments() const override {
-    return {"data", "label"};
-  }
-
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(mxnet::ShapeVector *in_shape,
-                  mxnet::ShapeVector *out_shape,
-                  mxnet::ShapeVector *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, label]";
-    const mxnet::TShape &dshape = in_shape->at(0);
-    if (!shape_is_known(dshape)) return false;
-
-    // label.shape == data.shape: use probability as label
-    if (dshape != (*in_shape)[softmaxout_enum::kLabel]) {
-      if (param_.multi_output) {
-        mxnet::TShape lshape1 = Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1]);
-        mxnet::TShape lshape2(dshape.ndim() - 1, -1);
-        lshape2[0] = dshape[0];
-        for (int i = 2; i < dshape.ndim(); ++i)
-          lshape2[i-1] = dshape[i];
-        mxnet::TShape lshape3 = dshape;
-        lshape3[1] = 1;
-        if (!mxnet::ndim_is_known(in_shape->at(softmaxout_enum::kLabel))) {
-          in_shape->at(softmaxout_enum::kLabel) = lshape1;
-        } else if (in_shape->at(softmaxout_enum::kLabel) == lshape1) {
-        } else if (in_shape->at(softmaxout_enum::kLabel) == lshape2) {
-        } else if (in_shape->at(softmaxout_enum::kLabel) == lshape3) {
-        } else {
-          std::ostringstream os;
-          os << "Expecting " << lshape1 << " or " << lshape2
-             << ". But got " << in_shape->at(softmaxout_enum::kLabel);
-          throw InferShapeError(os.str(), softmaxout_enum::kLabel);
-        }
-      } else {
-        mxnet::TShape label_shape(dshape.ndim() - 1, -1);
-        for (int i = 0; i + 1 < dshape.ndim(); ++i)
-          label_shape[i] = dshape[i];
-        SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, label_shape);
-      }
-    }
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (size_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-      }
-    }
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new SoftmaxOutputProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "SoftmaxOutput";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    if (param_.out_grad) {
-      return {in_data[softmaxout_enum::kLabel], out_data[softmaxout_enum::kOut],
-              out_grad[softmaxout_enum::kOut]};
-    } else {
-      return {in_data[softmaxout_enum::kLabel], out_data[softmaxout_enum::kOut]};
-    }
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {{out_data[softmaxout_enum::kOut], in_grad[softmaxout_enum::kData]}};
-  }
-
-  std::vector<std::pair<int, void*> > ForwardInplaceOption(
-    const std::vector<int> &in_data,
-    const std::vector<void*> &out_data) const override {
-    return {{in_data[softmaxout_enum::kData], out_data[softmaxout_enum::kOut]}};
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return nullptr;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape,
-                             std::vector<int> *in_type) const override;
-
- protected:
-  SoftmaxOutputParam param_;
-};  // class SoftmaxOutputProp
-
-class DeprecatedSoftmaxProp : public SoftmaxOutputProp {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    LOG(INFO) << "Softmax symbol is renamed to SoftmaxOutput. "
-      << "This API will be deprecated in Dec, 2015";
-    SoftmaxOutputProp::param_.Init(kwargs);
-  }
-
-  std::string TypeString() const override {
-    return "Softmax";
-  }
-};
-#endif  // DMLC_USE_CXX11
-
-}  // namespace op
-}  // namespace mxnet
-
-namespace std {
-template<>
-struct hash<mxnet::op::SoftmaxOutputParam> {
-  size_t operator()(const mxnet::op::SoftmaxOutputParam& val) {
-    size_t ret = 0;
-    ret = dmlc::HashCombine(ret, val.grad_scale);
-    ret = dmlc::HashCombine(ret, val.ignore_label);
-    ret = dmlc::HashCombine(ret, val.multi_output);
-    ret = dmlc::HashCombine(ret, val.use_ignore);
-    ret = dmlc::HashCombine(ret, val.preserve_shape);
-    ret = dmlc::HashCombine(ret, val.normalization);
-    ret = dmlc::HashCombine(ret, val.out_grad);
-    ret = dmlc::HashCombine(ret, val.smooth_alpha);
-    return ret;
-  }
-};
-}  // namespace std
-
-#endif  // MXNET_OPERATOR_SOFTMAX_OUTPUT_INL_H_
diff --git a/src/operator/softmax_output.cc b/src/operator/softmax_output.cc
deleted file mode 100644
index d87b78145e9e..000000000000
--- a/src/operator/softmax_output.cc
+++ /dev/null
@@ -1,285 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file softmax_output.cc
- * \brief
- * \author Bing Xu, Zhang Rong A
-*/
-#include "./softmax_output-inl.h"
-#if MXNET_USE_MKLDNN == 1
-#include "./nn/mkldnn/mkldnn_ops-inl.h"
-#include "./nn/mkldnn/mkldnn_base-inl.h"
-#endif
-namespace mxnet {
-namespace op {
-
-DMLC_REGISTER_PARAMETER(SoftmaxOutputParam);
-struct SoftmaxOutputGrad {
-  const char *op_name;
-  std::vector<nnvm::NodeEntry> operator()(const nnvm::ObjectPtr& n,
-                                          const std::vector<nnvm::NodeEntry>& ograds) const {
-  std::vector<nnvm::NodeEntry> out_data(n->num_outputs());
-  for (uint32_t i = 0; i < out_data.size(); ++i) {
-    out_data[i] = nnvm::NodeEntry{n, i, 0};
-  }
-  std::vector<nnvm::NodeEntry> heads;
-  heads.push_back(out_data[softmaxout_enum::kOut]);
-  heads.push_back(n->inputs[softmaxout_enum::kLabel]);
-
-  nnvm::ObjectPtr gnode = nnvm::Node::Create();
-  gnode->inputs = std::move(heads);
-  gnode->control_deps.emplace_back(n);
-  gnode->attrs = n->attrs;
-  gnode->attrs.op = nnvm::Op::Get("_backward_SoftmaxOutput");
-  gnode->attrs.name = n->attrs.name + "_backward";
-  std::vector<nnvm::NodeEntry> in_grad(2);
-  in_grad[0] = nnvm::NodeEntry{gnode, 0, 0};
-  in_grad[1] = nnvm::NodeEntry{gnode, 1, 0};
-  return in_grad;
-  }
-};
-
-static inline std::vector<std::string> ListArguments() {
-  return {"data", "label"};
-}
-
-static bool SoftmaxOutputType(const nnvm::NodeAttrs& attrs,
-                              std::vector<int> *in_type,
-                              std::vector<int> *out_type) {
-  CHECK_EQ(in_type->size(), 2U);
-  int dtype = (*in_type)[0];
-  if (type_is_none(dtype)) {
-    // Input type is undefined, we try backward inference
-    if (out_type->size() == 0 || type_is_none((*out_type)[0])) {
-      // Neither the input nor the output are defined,
-      // types cannot be infered for this op
-      return false;
-    } else {
-      // Input type is undefined but output type is: backward inference
-      dtype = (*out_type)[0];
-    }
-  } else {
-    // Input type is defined but output type is not: forward inference
-    out_type->clear();
-    out_type->push_back(dtype);
-  }
-  for (size_t i = 0; i < in_type->size(); ++i) {
-    if ((*in_type)[i] == -1) {
-      (*in_type)[i] = dtype;
-    } else {
-      UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-    }
-  }
-  return true;
-}
-
-static bool SoftmaxOutputShape(const nnvm::NodeAttrs& attrs,
-                               mxnet::ShapeVector *in_shape,
-                               mxnet::ShapeVector *out_shape) {
-  using namespace mshadow;
-  const SoftmaxOutputParam& param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
-  CHECK_EQ(in_shape->size(), 2U) << "Input:[data, label]";
-  const mxnet::TShape &dshape = in_shape->at(0);
-  if (!mxnet::ndim_is_known(dshape)) return false;
-
-  // label.shape == data.shape: use probability as label
-  if (dshape != (*in_shape)[softmaxout_enum::kLabel]) {
-    if (param.multi_output) {
-      mxnet::TShape lshape1 = Shape2(dshape[0], dshape.Size()/dshape[0]/dshape[1]);
-      mxnet::TShape lshape2(dshape.ndim() - 1, -1);
-      lshape2[0] = dshape[0];
-      for (int i = 2; i < dshape.ndim(); ++i)
-        lshape2[i-1] = dshape[i];
-      mxnet::TShape lshape3 = dshape;
-      lshape3[1] = 1;
-      if (!mxnet::ndim_is_known(in_shape->at(softmaxout_enum::kLabel))) {
-        in_shape->at(softmaxout_enum::kLabel) = lshape1;
-      } else if (in_shape->at(softmaxout_enum::kLabel) == lshape1) {
-      } else if (in_shape->at(softmaxout_enum::kLabel) == lshape2) {
-      } else if (in_shape->at(softmaxout_enum::kLabel) == lshape3) {
-      } else {
-        std::ostringstream os;
-        os << "Expecting " << lshape1 << " or " << lshape2
-           << ". But got " << in_shape->at(softmaxout_enum::kLabel);
-        throw InferShapeError(os.str(), softmaxout_enum::kLabel);
-      }
-    } else {
-      mxnet::TShape label_shape(dshape.ndim() - 1, -1);
-      for (int i = 0; i + 1 < dshape.ndim(); ++i)
-        label_shape[i] = dshape[i];
-      SHAPE_ASSIGN_CHECK(*in_shape, softmaxout_enum::kLabel, label_shape);
-    }
-  }
-
-  out_shape->clear();
-  out_shape->push_back(dshape);
-  return true;
-}
-
-#if MXNET_USE_MKLDNN == 1
-inline static bool SoftmaxOutputStorageType(const nnvm::NodeAttrs& attrs,
-                                            const int dev_mask,
-                                            DispatchMode* dispatch_mode,
-                                            std::vector<int>* in_attrs,
-                                            std::vector<int>* out_attrs) {
-  CHECK_EQ(in_attrs->size(), 2);
-  CHECK_EQ(out_attrs->size(), 1);
-
-  return MKLDNNStorageType(attrs, dev_mask, true, dispatch_mode, in_attrs,
-                           out_attrs);
-}
-
-void SoftmaxOutputComputeExCPU(const nnvm::NodeAttrs &attrs,
-                               const OpContext &ctx,
-                               const std::vector<NDArray> &inputs,
-                               const std::vector<OpReqType> &req,
-                               const std::vector<NDArray> &outputs) {
-  CHECK_EQ(inputs.size(), 2U);
-  const SoftmaxOutputParam &param = nnvm::get<SoftmaxOutputParam>(attrs.parsed);
-  if (SupportMKLDNN(inputs[0]) && !ctx.is_train && SupportMKLDNNSoftmaxOutput(param)) {
-    MKLDNN_OPCHECK_INIT(false, outputs.size(), inputs, outputs);
-    MKLDNNRun(MKLDNNSoftmaxOutputForward, attrs, ctx, inputs, req, outputs);
-    MKLDNN_OPCHECK_RUN(SoftmaxOutputCompute<cpu>, attrs, ctx, inputs, req, outputs);
-    return;
-  }
-  FallBackCompute(SoftmaxOutputCompute<cpu>, attrs, ctx, inputs, req, outputs);
-}
-#endif
-
-NNVM_REGISTER_OP(SoftmaxOutput)
-.describe(R"code(Computes the gradient of cross entropy loss with respect to softmax output.
-
-- This operator computes the gradient in two steps.
-  The cross entropy loss does not actually need to be computed.
-
-  - Applies softmax function on the input array.
-  - Computes and returns the gradient of cross entropy loss w.r.t. the softmax output.
-
-- The softmax function, cross entropy loss and gradient is given by:
-
-  - Softmax Function:
-
-    .. math:: \text{softmax}(x)_i = \frac{exp(x_i)}{\sum_j exp(x_j)}
-
-  - Cross Entropy Function:
-
-    .. math:: \text{CE(label, output)} = - \sum_i \text{label}_i \log(\text{output}_i)
-
-  - The gradient of cross entropy loss w.r.t softmax output:
-
-    .. math:: \text{gradient} = \text{output} - \text{label}
-
-- During forward propagation, the softmax function is computed for each instance in the input array.
-
-  For general *N*-D input arrays with shape :math:`(d_1, d_2, ..., d_n)`. The size is
-  :math:`s=d_1 \cdot d_2 \cdot \cdot \cdot d_n`. We can use the parameters `preserve_shape`
-  and `multi_output` to specify the way to compute softmax:
-
-  - By default, `preserve_shape` is ``false``. This operator will reshape the input array
-    into a 2-D array with shape :math:`(d_1, \frac{s}{d_1})` and then compute the softmax function for
-    each row in the reshaped array, and afterwards reshape it back to the original shape
-    :math:`(d_1, d_2, ..., d_n)`.
-  - If `preserve_shape` is ``true``, the softmax function will be computed along
-    the last axis (`axis` = ``-1``).
-  - If `multi_output` is ``true``, the softmax function will be computed along
-    the second axis (`axis` = ``1``).
-
-- During backward propagation, the gradient of cross-entropy loss w.r.t softmax output array is computed.
-  The provided label can be a one-hot label array or a probability label array.
-
-  - If the parameter `use_ignore` is ``true``, `ignore_label` can specify input instances
-    with a particular label to be ignored during backward propagation. **This has no effect when
-    softmax `output` has same shape as `label`**.
-
-    Example::
-
-      data = [[1,2,3,4],[2,2,2,2],[3,3,3,3],[4,4,4,4]]
-      label = [1,0,2,3]
-      ignore_label = 1
-      SoftmaxOutput(data=data, label = label,\
-                    multi_output=true, use_ignore=true,\
-                    ignore_label=ignore_label)
-      ## forward softmax output
-      [[ 0.0320586   0.08714432  0.23688284  0.64391428]
-       [ 0.25        0.25        0.25        0.25      ]
-       [ 0.25        0.25        0.25        0.25      ]
-       [ 0.25        0.25        0.25        0.25      ]]
-      ## backward gradient output
-      [[ 0.    0.    0.    0.  ]
-       [-0.75  0.25  0.25  0.25]
-       [ 0.25  0.25 -0.75  0.25]
-       [ 0.25  0.25  0.25 -0.75]]
-      ## notice that the first row is all 0 because label[0] is 1, which is equal to ignore_label.
-
-  - The parameter `grad_scale` can be used to rescale the gradient, which is often used to
-    give each loss function different weights.
-
-  - This operator also supports various ways to normalize the gradient by `normalization`,
-    The `normalization` is applied if softmax output has different shape than the labels.
-    The `normalization` mode can be set to the followings:
-
-    - ``'null'``: do nothing.
-    - ``'batch'``: divide the gradient by the batch size.
-    - ``'valid'``: divide the gradient by the number of instances which are not ignored.
-
-)code" ADD_FILELINE)
-.set_num_inputs(2)
-.set_num_outputs(1)
-.set_attr_parser(ParamParser<SoftmaxOutputParam>)
-#if MXNET_USE_MKLDNN == 1
-.set_attr<FInferStorageType>("FInferStorageType", SoftmaxOutputStorageType)
-.set_attr<bool>("TIsMKLDNN", true)
-.set_attr<FComputeEx>("FComputeEx<cpu>", SoftmaxOutputComputeExCPU)
-#endif
-.set_attr<nnvm::FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
-  return std::vector<std::string>{"data", "label"};
-})
-.set_attr<nnvm::FListOutputNames>("FListOutputNames", [](const NodeAttrs& attrs) {
-  return std::vector<std::string>{"output"};
-})
-.set_attr<mxnet::FInferShape>("FInferShape", SoftmaxOutputShape)
-.set_attr<nnvm::FInferType>("FInferType", SoftmaxOutputType)
-.set_attr<FCompute>("FCompute<cpu>", SoftmaxOutputCompute<cpu>)
-.set_attr<nnvm::FGradient>("FGradient", SoftmaxOutputGrad{"_backward_SoftmaxOutput"})
-.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
-  return std::vector<std::pair<int, int> >{{0, 0}};
-})
-.add_argument("data", "NDArray-or-Symbol", "Input array.")
-.add_argument("label", "NDArray-or-Symbol", "Ground truth label.")
-.add_arguments(SoftmaxOutputParam::__FIELDS__());
-
-// Softmax symbol is renamed to SoftmaxOutput and deprecated since Dec, 2015
-NNVM_REGISTER_OP(SoftmaxOutput).add_alias("Softmax");
-
-NNVM_REGISTER_OP(_backward_SoftmaxOutput)
-.set_num_inputs(2)
-.set_num_outputs(2)
-.set_attr<nnvm::TIsBackward>("TIsBackward", true)
-.set_attr<nnvm::FInplaceOption>("FInplaceOption", [](const NodeAttrs& attrs){
-  return std::vector<std::pair<int, int> >{{0, 0}};
-})
-.set_attr<FResourceRequest>("FResourceRequest", [](const NodeAttrs& n){
-  return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
-})
-.set_attr_parser(ParamParser<SoftmaxOutputParam>)
-.set_attr<FCompute>("FCompute<cpu>", SoftmaxOutputGradCompute<cpu>);
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/softmax_output.cu b/src/operator/softmax_output.cu
deleted file mode 100644
index b2a41672e92a..000000000000
--- a/src/operator/softmax_output.cu
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file softmax_output.cu
- * \brief
- * \author Bing Xu
-*/
-
-#include "./softmax_output-inl.h"
-
-namespace mxnet {
-namespace op {
-
-NNVM_REGISTER_OP(SoftmaxOutput)
-.set_attr<FCompute>("FCompute<gpu>", SoftmaxOutputCompute<gpu>);
-
-NNVM_REGISTER_OP(_backward_SoftmaxOutput)
-.set_attr<FCompute>("FCompute<gpu>", SoftmaxOutputGradCompute<gpu>);
-
-}  // namespace op
-}  // namespace mxnet
-
diff --git a/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h b/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h
index d444e7a6239a..2d133579f0d5 100644
--- a/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h
+++ b/src/operator/subgraph/tensorrt/nnvm_to_onnx-inl.h
@@ -120,11 +120,6 @@ void ConvertFullyConnected(NodeProto *node_proto,
                            const nnvm::IndexedGraph &ig,
                            const array_view<IndexedGraph::NodeEntry> &inputs);
 
-void ConvertSoftmaxOutput(NodeProto *node_proto,
-                          const NodeAttrs &attrs,
-                          const nnvm::IndexedGraph &ig,
-                          const array_view<IndexedGraph::NodeEntry> &inputs);
-
 void ConvertFlatten(NodeProto *node_proto,
                     const NodeAttrs &attrs,
                     const nnvm::IndexedGraph &ig,
@@ -190,7 +185,6 @@ static const std::unordered_map<std::string, ConverterFunction> converter_map =
   {"Pad", ConvertPad},
   {"Pooling", ConvertPooling},
   {"relu", ConvertRelu},
-  {"SoftmaxOutput", ConvertSoftmaxOutput}
 };
 
 typedef void (*PreprocessFunction)(const NodeAttrs &attrs,
diff --git a/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc b/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc
index 4f80d277cad8..3c0312623abb 100644
--- a/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc
+++ b/src/operator/subgraph/tensorrt/nnvm_to_onnx.cc
@@ -42,7 +42,6 @@
 #include "../../nn/fully_connected-inl.h"
 #include "../../nn/pooling-inl.h"
 #include "../../nn/concat-inl.h"
-#include "../../softmax_output-inl.h"
 #include "../../tensor/matrix_op-inl.h"
 
 #if MXNET_USE_TENSORRT_ONNX_CHECKER
@@ -394,20 +393,6 @@ void ConvertFullyConnected(NodeProto* node_proto, const NodeAttrs& attrs,
   }
 }
 
-void ConvertSoftmaxOutput(NodeProto* node_proto, const NodeAttrs& /*attrs*/,
-                          const nnvm::IndexedGraph& /*ig*/,
-                          const array_view<IndexedGraph::NodeEntry>& /*inputs*/) {
-  node_proto->set_op_type("Softmax");
-
-  // Setting by default to 1 since MXNet doesn't provide such an attribute for softmax in its
-  // node params. This attribute is only relevant when the input is coerced to 2D, and in that
-  // case dimension 0 is assumed to be the batch dimension.
-  AttributeProto* const axis = node_proto->add_attribute();
-  axis->set_name("axis");
-  axis->set_type(AttributeProto::INT);
-  axis->set_i(1);
-}
-
 void ConvertFlatten(NodeProto* node_proto, const NodeAttrs& /*attrs*/,
                     const nnvm::IndexedGraph& /*ig*/,
                     const array_view<IndexedGraph::NodeEntry>& /*inputs*/) {
diff --git a/src/operator/subgraph/tensorrt/tensorrt-inl.h b/src/operator/subgraph/tensorrt/tensorrt-inl.h
index dcafba55959d..16cc13006d59 100644
--- a/src/operator/subgraph/tensorrt/tensorrt-inl.h
+++ b/src/operator/subgraph/tensorrt/tensorrt-inl.h
@@ -102,7 +102,6 @@ class TensorrtSelector : public SubgraphSelector {
     "Pad",
     "relu",
     "rsqrt",
-    "SoftmaxOutput"
   };
 
   const std::unordered_set<std::string> withWeightsOps = {
diff --git a/src/operator/svm_output-inl.h b/src/operator/svm_output-inl.h
deleted file mode 100644
index 71fb91175f37..000000000000
--- a/src/operator/svm_output-inl.h
+++ /dev/null
@@ -1,225 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file svm_output-inl.h
- * \brief
- * \author Jonas Amaro
-*/
-#ifndef MXNET_OPERATOR_SVM_OUTPUT_INL_H_
-#define MXNET_OPERATOR_SVM_OUTPUT_INL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <cstring>
-#include <map>
-#include <string>
-#include <vector>
-#include <utility>
-#include "./operator_common.h"
-#include "./mshadow_op.h"
-
-namespace mxnet {
-namespace op {
-
-namespace svm_enum {
-enum SVMOutputOpInputs {kData, kLabel};
-enum SVMOutputOpOutputs {kOut};
-enum SVMOutputNormType {kNull, kBatch, kValid};
-enum SVMOutputOpResource {kTempSpace};
-}  // namespace svm_enum
-
-
-struct SVMOutputParam : public dmlc::Parameter<SVMOutputParam> {
-  float margin;
-  float regularization_coefficient;
-  bool use_linear;
-  DMLC_DECLARE_PARAMETER(SVMOutputParam) {
-    DMLC_DECLARE_FIELD(margin).set_default(1.0f)
-    .describe("The loss function penalizes outputs that lie outside this margin. "
-        "Default margin is 1.");
-    DMLC_DECLARE_FIELD(regularization_coefficient).set_default(1.0f)
-    .describe("Regularization parameter for the SVM. "
-        "This balances the tradeoff between coefficient size and error.");
-    DMLC_DECLARE_FIELD(use_linear).set_default(false)
-    .describe("Whether to use L1-SVM objective. L2-SVM objective is used by default.");
-  };
-};
-
-template<typename xpu, typename DType>
-class SVMOutputOp : public Operator {
- public:
-  explicit SVMOutputOp(SVMOutputParam param) : param_(param) {}
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 2U) << "Expecting [data, label]";
-    CHECK_EQ(out_data.size(), 1U) << "Expecting [output]";
-    CHECK_EQ(req.size(), 1U) << "Expecting output.size() == req.size()";
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 2, DType> data = in_data[svm_enum::kData].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> out = out_data[svm_enum::kOut].FlatTo2D<xpu, DType>(s);
-    Assign(out, req[svm_enum::kOut], F<mshadow_op::identity>(data));
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 2U);
-    CHECK_EQ(out_grad.size(), 1U);
-    CHECK_GE(in_grad.size(), 1U);
-    CHECK_GE(req.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    const mxnet::TShape& label_shape = in_data[svm_enum::kLabel].shape_;
-
-    Tensor<xpu, 1, DType> label = in_data[svm_enum::kLabel].get_with_shape<xpu, 1, DType>(
-        Shape1(label_shape.ProdShape(0, label_shape.ndim())), s);
-    Tensor<xpu, 2, DType> out = out_data[svm_enum::kOut].FlatTo2D<xpu, DType>(s);
-    Tensor<xpu, 2, DType> grad = in_grad[svm_enum::kData].FlatTo2D<xpu, DType>(s);
-    CHECK_EQ(grad.shape_, out.shape_) << "SVMOutputs: shape mismatch";
-
-    if (param_.use_linear) {
-      L1_SVM(DType(param_.margin), DType(param_.regularization_coefficient), grad, label, out);
-    } else {
-      L2_SVM(DType(param_.margin), DType(param_.regularization_coefficient), grad, label, out);
-    }
-  }
-
- private:
-  SVMOutputParam param_;
-};  // class SVMOutputOp
-
-// Declare Factory function, used for dispatch specialization
-template<typename xpu>
-Operator* CreateOp(SVMOutputParam param, int dtype);
-
-#if DMLC_USE_CXX11
-class SVMOutputProp : public OperatorProperty {
- public:
-  std::vector<std::string> ListArguments() const override {
-    return {"data", "label"};
-  }
-
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    param_.Init(kwargs);
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(mxnet::ShapeVector *in_shape,
-                  mxnet::ShapeVector *out_shape,
-                  mxnet::ShapeVector *aux_shape) const override {
-    using namespace mshadow;
-    CHECK_EQ(in_shape->size(), 2U) << "Input:[data, label]";
-    const mxnet::TShape &dshape = in_shape->at(0);
-    if (!mxnet::ndim_is_known(dshape)) return false;
-    mxnet::TShape label_shape(dshape.ndim() - 1, -1);
-    for (int i = 0; i + 1 < dshape.ndim(); ++i)
-      label_shape[i] = dshape[i];
-    SHAPE_ASSIGN_CHECK(*in_shape, svm_enum::kLabel, label_shape);
-    out_shape->clear();
-    out_shape->push_back(dshape);
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1U);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (size_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-      }
-    }
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new SVMOutputProp();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "SVMOutput";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {in_data[svm_enum::kLabel], out_data[svm_enum::kOut]};
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-    return {{out_data[svm_enum::kOut], in_grad[svm_enum::kData]}};
-  }
-
-  std::vector<std::pair<int, void*> > ForwardInplaceOption(
-    const std::vector<int> &in_data,
-    const std::vector<void*> &out_data) const override {
-    return {{in_data[svm_enum::kData], out_data[svm_enum::kOut]}};
-  }
-
-  std::vector<ResourceRequest> BackwardResource(
-      const mxnet::ShapeVector &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return nullptr;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape,
-                             std::vector<int> *in_type) const override;
-
- protected:
-  SVMOutputParam param_;
-};  // class SVMOutputProp
-#endif  // DMLC_USE_CXX11
-
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_SVM_OUTPUT_INL_H_
diff --git a/src/operator/svm_output.cc b/src/operator/svm_output.cc
deleted file mode 100644
index a52aa4779176..000000000000
--- a/src/operator/svm_output.cc
+++ /dev/null
@@ -1,101 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file svm_output.cc
- * \brief
- * \author Jonas Amaro
-*/
-#include "./svm_output-inl.h"
-#include "./mshadow_op.h"
-
-namespace mshadow {
-  template<typename DType>
-  inline void L1_SVM(const DType & margin,
-                     const DType & reg_coef,
-                     Tensor<cpu, 2, DType> dst,
-                     const Tensor<cpu, 1, DType> & label,
-                     const Tensor<cpu, 2, DType> & src) {
-    for (index_t y = 0; y < dst.size(0); y++) {
-      const index_t k = static_cast<int>(label[y]);
-      for (index_t x = 0; x < dst.size(1); x++) {
-        if (x == k) {
-          dst[y][k] = -DType(margin > src[y][k]) * reg_coef;
-        } else {
-          dst[y][x] = DType(margin > -src[y][x]) * reg_coef;
-        }
-      }
-    }
-  }
-
-
-  template<typename DType>
-  inline void L2_SVM(const DType & margin,
-                     const DType & reg_coef,
-                     Tensor<cpu, 2, DType> dst,
-                     const Tensor<cpu, 1, DType> & label,
-                     const Tensor<cpu, 2, DType> & src) {
-    for (index_t y = 0; y < dst.size(0); y++) {
-      const index_t k = static_cast<int>(label[y]);
-      for (index_t x = 0; x < dst.size(1); x++) {
-        if (x == k) {
-          dst[y][k] = margin > src[y][k] ?  2*(margin - src[y][k]) : DType(0.0f);
-          dst[y][k] *= -reg_coef;
-        } else {
-          dst[y][x] = margin > -src[y][x] ? (-2)*(margin + src[y][x]) : DType(0.0f);
-          dst[y][x] *= -reg_coef;
-        }
-      }
-    }
-  }
-}  // namespace mshadow
-
-namespace mxnet {
-namespace op {
-template<>
-Operator *CreateOp<cpu>(SVMOutputParam param, int dtype) {
-  Operator *op = nullptr;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new SVMOutputOp<cpu, DType>(param);
-  })
-  return op;
-}
-
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *SVMOutputProp::CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape,
-                                     std::vector<int> *in_type) const {
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
-}
-
-DMLC_REGISTER_PARAMETER(SVMOutputParam);
-
-MXNET_REGISTER_OP_PROPERTY(SVMOutput, SVMOutputProp)
-.describe(R"code(Computes support vector machine based transformation of the input.
-
-This tutorial demonstrates using SVM as output layer for classification instead of softmax:
-https://github.com/dmlc/mxnet/tree/master/example/svm_mnist.
-
-)code")
-.add_argument("data", "NDArray-or-Symbol", "Input data for SVM transformation.")
-.add_argument("label", "NDArray-or-Symbol", "Class label for the input data.")
-.add_arguments(SVMOutputParam::__FIELDS__());
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/svm_output.cu b/src/operator/svm_output.cu
deleted file mode 100644
index 081433df377a..000000000000
--- a/src/operator/svm_output.cu
+++ /dev/null
@@ -1,119 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file svm_output.cu
- * \brief
- * \author Jonas Amaro
-*/
-
-#include "./svm_output-inl.h"
-#include <device_launch_parameters.h>
-#include "mshadow/tensor.h"
-
-
-namespace mshadow {
-
-template<int n_bits, typename DType>
-__global__  void L1_SVMKernel(const DType margin,
-                              const DType reg_coef,
-                              Tensor<gpu, 2, DType> dst,
-                              const Tensor<gpu, 1, DType> label,
-                              const Tensor<gpu, 2, DType> src) {
-  const index_t nmax = dst.size(1);
-  const unsigned n_size = 1 << n_bits;
-  const int y = blockIdx.x;
-  const int n = threadIdx.x;
-  const index_t k = static_cast<int>(label[y]);
-  for (index_t n_index = n; n_index < nmax; n_index += n_size) {
-    if (n_index == k) {
-      dst[y][k] = -DType(margin > src[y][k]) * reg_coef;
-    } else {
-      dst[y][n_index] = DType(margin > -src[y][n_index]) * reg_coef;
-    }
-  }
-}
-
-template<typename DType>
-inline void L1_SVM(const DType & margin,
-                   const DType & reg_coef,
-                   Tensor<gpu, 2, DType> dst,
-                   const Tensor<gpu, 1, DType> & label,
-                   const Tensor<gpu, 2, DType> & src) {
-  dim3 dimBlock(cuda::kBaseThreadNum);
-  dim3 dimGrid(dst.size(0));
-  cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
-  L1_SVMKernel<cuda::kBaseThreadBits, DType> <<<dimGrid, dimBlock, 0, stream >>>
-    (margin, reg_coef, dst, label, src);
-  MSHADOW_CUDA_POST_KERNEL_CHECK(L1_SVMKernel);
-}
-
-
-template<int n_bits, typename DType>
-__global__  void L2_SVMKernel(const DType margin,
-                              const DType reg_coef,
-                              Tensor<gpu, 2, DType> dst,
-                              const Tensor<gpu, 1, DType> label,
-                              const Tensor<gpu, 2, DType> src) {
-  const index_t nmax = dst.size(1);
-  const unsigned n_size = 1 << n_bits;
-  const int y = blockIdx.x;
-  const int n = threadIdx.x;
-  const index_t k = static_cast<int>(label[y]);
-  for (index_t n_index = n; n_index < nmax; n_index += n_size) {
-    if (n_index == k) {
-      dst[y][k] = margin > src[y][k] ? 2 * (margin - src[y][k]) : DType(0.0f);
-      dst[y][k] *= -reg_coef;
-    } else {
-      dst[y][n_index] = margin > -src[y][n_index] ? (-2)*(margin + src[y][n_index]) : DType(0.0f);
-      dst[y][n_index] *= -reg_coef;
-    }
-  }
-}
-
-template<typename DType>
-inline void L2_SVM(const DType & margin,
-                   const DType & reg_coef,
-                   Tensor<gpu, 2, DType> dst,
-                   const Tensor<gpu, 1, DType> & label,
-                   const Tensor<gpu, 2, DType> & src) {
-  dim3 dimBlock(cuda::kBaseThreadNum);
-  dim3 dimGrid(dst.size(0));
-  cudaStream_t stream = Stream<gpu>::GetStream(dst.stream_);
-  L2_SVMKernel<cuda::kBaseThreadBits, DType> <<<dimGrid, dimBlock, 0, stream >>>
-    (margin, reg_coef, dst, label, src);
-  MSHADOW_CUDA_POST_KERNEL_CHECK(L2_SVMKernel);
-}
-}  // namespace mshadow
-
-namespace mxnet {
-namespace op {
-template<>
-Operator *CreateOp<gpu>(SVMOutputParam param, int dtype) {
-  Operator *op = nullptr;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new SVMOutputOp<gpu, DType>(param);
-  })
-  return op;
-}
-
-}  // namespace op
-}  // namespace mxnet
-
diff --git a/tests/cpp/thread_safety/thread_safety_test.cc b/tests/cpp/thread_safety/thread_safety_test.cc
index 9566adfd9d13..49bd14d340fe 100644
--- a/tests/cpp/thread_safety/thread_safety_test.cc
+++ b/tests/cpp/thread_safety/thread_safety_test.cc
@@ -647,30 +647,4 @@ TEST(ThreadSafety, Engine) {
   mxnet::test::AssertEqual(output_mx_arr, result_expected, 1e-2, 1e-5);
   mxnet::cpp::NDArray::WaitAll();
 }
-
-TEST(ThreadSafety, CachedOpFullModel) {
-  std::vector<std::string> models_list = {
-      "imagenet1k-resnet-18", "imagenet1k-resnet-152", "imagenet1k-resnet-50"};
-  if (mxnet::test::thread_safety_force_cpu) {
-    models_list.push_back("imagenet1k-resnet-152-subgraph");
-  }
-  for (const auto &model : models_list) {
-    run_inference(model, 1, true, 20);
-    run_inference(model, 2, true, 20);
-    run_inference(model, 4, true, 5);
-    run_inference(model, 4, true, 20);
-    run_inference(model, 4, false, 20);
-    run_inference(model, 8, true, 20);
-    // static_alloc = true
-    run_inference(model, 2, true, 20, true);
-    run_inference(model, 4, true, 5, true);
-    run_inference(model, 4, true, 20, true);
-    run_inference(model, 8, true, 20, true);
-    // static_alloc = true, static_shape = true
-    run_inference(model, 4, true, 20, true, true);
-    run_inference(model, 8, true, 20, true, true);
-    // the below line may hang
-    // run_inference_unsupported(model, 32, false, 20);
-  }
-}
 #endif
diff --git a/tests/jenkins/run_test.sh b/tests/jenkins/run_test.sh
index 48bb4da53fc4..59516d46bcd0 100755
--- a/tests/jenkins/run_test.sh
+++ b/tests/jenkins/run_test.sh
@@ -44,7 +44,6 @@ export PYTHONPATH=$(pwd)/python
 echo "BUILD python_test"
 pytest --verbose tests/python/unittest || exit -1
 pytest --verbose tests/python/gpu/test_operator_gpu.py || exit -1
-pytest --verbose tests/python/gpu/test_forward.py || exit -1
 pytest --verbose tests/python/train || exit -1
 
 echo "BUILD scala_test"
diff --git a/tests/jenkins/run_test_ubuntu.sh b/tests/jenkins/run_test_ubuntu.sh
index 9c3d3c55c852..835bce9aeef7 100755
--- a/tests/jenkins/run_test_ubuntu.sh
+++ b/tests/jenkins/run_test_ubuntu.sh
@@ -56,7 +56,6 @@ export PYTHONPATH=${PWD}/python
 echo "BUILD python_test"
 pytest --verbose tests/python/unittest || exit 1
 pytest --verbose tests/python/gpu/test_operator_gpu.py || exit 1
-pytest --verbose tests/python/gpu/test_forward.py || exit 1
 pytest --verbose tests/python/train || exit 1
 
 echo "BUILD scala_test"
diff --git a/tests/nightly/test_large_array.py b/tests/nightly/test_large_array.py
index 0c9c6f905dd6..377a8709bfbc 100644
--- a/tests/nightly/test_large_array.py
+++ b/tests/nightly/test_large_array.py
@@ -100,36 +100,6 @@ def check_softmax_cross_entropy():
         assert_almost_equal(mx_softmax_cross_entropy.asnumpy(),
                             true_softmax_cross_entropy, rtol=1e-3, atol=1e-5)
 
-    def check_softmax_output():
-        x = mx.sym.Variable('x')
-        label = mx.sym.Variable('label')
-        x_nd = mx.nd.ones((LARGE_X, SMALL_Y))
-        grad_x = mx.nd.zeros((LARGE_X, SMALL_Y))
-        label_nd = mx.nd.ones((LARGE_X))
-        sym = mx.sym.SoftmaxOutput(data=x, label=label, ignore_label=0,
-                                   use_ignore=False)
-
-        ex = sym.bind(ctx=default_context(), args={'x': x_nd, 'label': label_nd},
-                      args_grad=None)
-        ex.forward(is_train=False)
-        softmax_out = ex.outputs[0][0].asnumpy()
-        expected_softmax_out = (1 / SMALL_Y) * mx.nd.ones((SMALL_Y)).asnumpy()
-        assert np.isclose(softmax_out, expected_softmax_out).all()
-
-        ex = sym.bind(ctx=default_context(), args={'x': x_nd, 'label': label_nd},
-                      args_grad={'x': grad_x})
-        ex.forward(is_train=True)
-        softmax_out = ex.outputs[0][0].asnumpy()
-        expected_softmax_out = (1 / SMALL_Y) * mx.nd.ones((SMALL_Y)).asnumpy()
-        assert np.isclose(softmax_out, expected_softmax_out).all()
-
-        ex.backward(is_train=True)
-        grad_out = ex.grad_arrays[0][0].asnumpy()
-        k = int(label_nd[0].asscalar())
-        expected_grad_out = np.zeros((SMALL_Y,))
-        expected_grad_out[k] = -1
-        assert np.isclose(grad_out - softmax_out, expected_grad_out).all()
-
     def check_softmax_activation():
         data = nd.random_normal(shape=(2**29, 2, 2, 2))
         out = nd.random_normal(shape=(2**29, 2, 2, 2))
@@ -358,42 +328,6 @@ def fsigmoid(a):
         ya = fsigmoid(xa)
         check_symbolic_forward(y, [xa], [ya])
 
-    def check_linear_and_logistic_regression():
-        shape = (LARGE_X, SMALL_Y)
-
-        def check_regression(symbol, forward, backward, shape):
-            # init executor
-            data_s = mx.symbol.Variable('data')
-            label_s = mx.symbol.Variable('label')
-            out_s = symbol(data=data_s, label=label_s)
-            grad_req = {'data': 'write', 'label': 'null'}
-            exe = out_s.simple_bind(ctx=default_context(), data=shape, label=shape, grad_req=grad_req)
-            arg_map = dict(zip(out_s.list_arguments(), exe.arg_arrays))
-            grad_map = dict(zip(out_s.list_arguments(), exe.grad_arrays))
-            # init data
-            data = mx.random.uniform(-1, -1, shape)
-            arg_map["data"][:] = data
-            atol = 1e-5
-            density = 0.5
-            stype = 'default'
-            label = arg_map["label"]
-            label[:] = rand_ndarray(shape, stype, density=density)
-            exe.forward(is_train=True)
-            exe.backward()
-            np_out = forward(data.asnumpy())
-            out_grad = backward(np_out, label.asnumpy().reshape(np_out.shape)) / shape[1]
-            assert_almost_equal(exe.outputs[0].asnumpy(), np_out, atol=atol)
-            assert_almost_equal(grad_map["data"].asnumpy(), out_grad, atol=atol)
-
-        check_regression(mx.symbol.LogisticRegressionOutput,
-                         lambda x: 1.0 / (1.0 + np.exp(-x)),
-                         lambda x, y: x - y,
-                         shape)
-        check_regression(mx.symbol.LinearRegressionOutput,
-                         lambda x: x,
-                         lambda x, y: x - y,
-                         shape)
-
     def check_l2_normalization():
         x = nd.ones((2, LARGE_X*2))
         x[0] = 3
@@ -570,7 +504,6 @@ def check_rnn():
     check_dense()
     check_softmax()
     check_softmax_cross_entropy()
-    check_softmax_output()
     check_softmax_activation()
     check_log_softmax()
     check_leaky_relu()
@@ -581,7 +514,6 @@ def check_rnn():
     check_batchnorm()
     check_relu()
     check_sigmoid()
-    check_linear_and_logistic_regression()
     check_l2_normalization()
     check_instance_norm()
     check_col2im()
diff --git a/tests/nightly/test_large_vector.py b/tests/nightly/test_large_vector.py
index 74f015cabf0f..e95b411974b2 100644
--- a/tests/nightly/test_large_vector.py
+++ b/tests/nightly/test_large_vector.py
@@ -45,34 +45,6 @@ def check_dense():
         res = linear(data)
         assert res.shape == (LARGE_X, 2)
 
-    def check_regression():
-        shape = (LARGE_X, )
-        def check_regression(symbol, forward, shape):
-            # init executor
-            data_s = mx.symbol.Variable('data')
-            label_s = mx.symbol.Variable('label')
-            out_s = symbol(data=data_s, label=label_s)
-            exe = out_s.simple_bind(ctx=mx.cpu(0), data=shape, label=shape)
-            arg_map = dict(zip(out_s.list_arguments(), exe.arg_arrays))
-            # init data
-            data = mx.random.uniform(-1, -1, shape)
-            arg_map["data"][:] = data
-            atol = 1e-5
-            density = 0.5
-            stype = 'default'
-            label = arg_map["label"]
-            label[:] = rand_ndarray(shape, stype, density=density)
-            exe.forward(is_train=True)
-            exe.backward()
-            np_out = forward(data.asnumpy())
-            assert_almost_equal(exe.outputs[0].asnumpy(), np_out, atol=atol)
-        check_regression(mx.symbol.LogisticRegressionOutput,
-                         lambda x: 1.0 / (1.0 + np.exp(-x)),
-                         shape)
-        check_regression(mx.symbol.LinearRegressionOutput,
-                         lambda x: x,
-                         shape)
-
     def check_sign():
         a = mx.nd.random.normal(-1, 1, shape=LARGE_X)
         mx_res = mx.nd.sign(a)
@@ -155,7 +127,6 @@ def check_sequence_last():
 
     check_sequence_last()
     check_dense()
-    check_regression()
     check_sign()
     check_layer_norm()
     check_batchnorm()
diff --git a/tests/nightly/test_optimizer.py b/tests/nightly/test_optimizer.py
deleted file mode 100644
index 0cba4d78e539..000000000000
--- a/tests/nightly/test_optimizer.py
+++ /dev/null
@@ -1,52 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import mxnet as mx
-
-import sys
-import os
-curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed
-
-# This script is testing the efficiency of LARS
-# We are training LeNet-5 at batch-size 8000 in 10 epochs above 98% accuracy
-# Which is not doable with simple SGD + momentum (from what have been tested so far)
-
-def lenet5():
-    """LeNet-5 Symbol"""
-    #pylint: disable=no-member
-    data = mx.sym.Variable('data')
-    conv1 = mx.sym.Convolution(data=data, kernel=(5, 5), num_filter=20)
-    tanh1 = mx.sym.Activation(data=conv1, act_type="tanh")
-    pool1 = mx.sym.Pooling(data=tanh1, pool_type="max",
-                           kernel=(2, 2), stride=(2, 2))
-    # second conv
-    conv2 = mx.sym.Convolution(data=pool1, kernel=(5, 5), num_filter=50)
-    tanh2 = mx.sym.Activation(data=conv2, act_type="tanh")
-    pool2 = mx.sym.Pooling(data=tanh2, pool_type="max",
-                           kernel=(2, 2), stride=(2, 2))
-    # first fullc
-    flatten = mx.sym.Flatten(data=pool2)
-    fc1 = mx.sym.FullyConnected(data=flatten, num_hidden=500)
-    tanh3 = mx.sym.Activation(data=fc1, act_type="tanh")
-    # second fullc
-    fc2 = mx.sym.FullyConnected(data=tanh3, num_hidden=10)
-    # loss
-    lenet = mx.sym.SoftmaxOutput(data=fc2, name='softmax')
-    #pylint: enable=no-member
-    return lenet
diff --git a/tests/python/common/models.py b/tests/python/common/models.py
index b563adc1d760..8c345000c77e 100644
--- a/tests/python/common/models.py
+++ b/tests/python/common/models.py
@@ -24,23 +24,3 @@ def mlp2():
     out = mx.symbol.Activation(data=out, act_type='relu')
     out = mx.symbol.FullyConnected(data=out, name='fc2', num_hidden=10)
     return out
-
-
-
-def conv():
-    data = mx.symbol.Variable('data')
-    conv1= mx.symbol.Convolution(data = data, name='conv1', num_filter=32, kernel=(3,3), stride=(2,2))
-    bn1 = mx.symbol.BatchNorm(data = conv1, name="bn1")
-    act1 = mx.symbol.Activation(data = bn1, name='relu1', act_type="relu")
-    mp1 = mx.symbol.Pooling(data = act1, name = 'mp1', kernel=(2,2), stride=(2,2), pool_type='max')
-
-    conv2= mx.symbol.Convolution(data = mp1, name='conv2', num_filter=32, kernel=(3,3), stride=(2,2))
-    bn2 = mx.symbol.BatchNorm(data = conv2, name="bn2")
-    act2 = mx.symbol.Activation(data = bn2, name='relu2', act_type="relu")
-    mp2 = mx.symbol.Pooling(data = act2, name = 'mp2', kernel=(2,2), stride=(2,2), pool_type='max')
-
-    fl = mx.symbol.Flatten(data = mp2, name="flatten")
-    fc2 = mx.symbol.FullyConnected(data = fl, name='fc2', num_hidden=10)
-    softmax = mx.symbol.SoftmaxOutput(data = fc2, name = 'sm')
-    return softmax
-
diff --git a/tests/python/gpu/test_contrib_amp.py b/tests/python/gpu/test_contrib_amp.py
index f856c8fb2b1b..0d47db3dfbbc 100644
--- a/tests/python/gpu/test_contrib_amp.py
+++ b/tests/python/gpu/test_contrib_amp.py
@@ -25,7 +25,7 @@
 import ctypes
 import mxnet.contrib.amp as amp
 import pytest
-from mxnet.test_utils import set_default_context, download_model, same_symbol_structure
+from mxnet.test_utils import set_default_context, same_symbol_structure
 from mxnet.gluon.model_zoo.vision import get_model
 from mxnet.gluon import SymbolBlock, nn, rnn
 from mxnet.contrib.amp import amp
@@ -103,180 +103,6 @@ def test_amp_coverage(amp_tests):
                        - If you are not sure which list to choose, FP32_FUNCS is the
                          safest option""")
 
-@with_seed()
-@pytest.mark.garbage_expected
-def test_amp_conversion(amp_tests):
-    def check_amp_convert_symbol():
-        x = mx.sym.var("x")
-        y = mx.sym.var("y")
-        z = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True)
-        siny = mx.sym.sin(y)
-        res = z + siny
-        # Compare symbols with similar computation graphs created using convert_symbol and manually.
-        res_converted = amp.convert_symbol(res, target_dtype="float16",
-                                           target_dtype_ops=["FullyConnected"],
-                                           fp32_ops=["sin"])
-
-        x_fp16 = mx.sym.amp_cast(x, dtype="float16")
-        y_fp16 = mx.sym.amp_cast(y, dtype="float16")
-        siny = mx.sym.sin(y)
-        z = mx.sym.FullyConnected(x_fp16, y_fp16, num_hidden=10, no_bias=True)
-        amp_casted_z = mx.sym.amp_cast(z, dtype="float32")
-        res_expected = amp_casted_z + siny
-        assert same_symbol_structure(res_converted, res_expected), \
-            "convert_symbol generating wrong computation graph"
-
-        # convert_symbol called with incorrect inputs
-        pytest.raises(AssertionError, amp.convert_symbol, res,
-                      target_dtype="float16", target_dtype_ops=["FullyConnected"],
-                      fp32_ops=["elemwise_add"])
-        pytest.raises(AssertionError, amp.convert_symbol, res,
-                      target_dtype="float16", target_dtype_ops=["FullyConnected"],
-                      fp32_ops=["Activation"],
-                      conditional_fp32_ops=[('Activation', 'act_type', ['selu'])])
-        pytest.raises(AssertionError, amp.convert_symbol, res,
-                      target_dtype="float16", target_dtype_ops=["Activation"],
-                      fp32_ops=["Activation"],
-                      conditional_fp32_ops=[('Activation', 'act_type', ['selu'])])
-        pytest.raises(AssertionError, amp.convert_symbol, res,
-                      target_dtype="float16", target_dtype_ops=["FullyConnected"],
-                      fp32_ops=["FullyConnected"])
-
-        # Test for op in conditional ops with condition not satisfied
-        x = mx.sym.var("x")
-        y = mx.sym.var("y")
-        fc_cond = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True)
-        res_converted = amp.convert_symbol(fc_cond, target_dtype="float16",
-                                           target_dtype_ops=[],
-                                           fp32_ops=["sin"],
-                                           conditional_fp32_ops=[("FullyConnected", "no_bias", ["False"])])
-
-        res_expected = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True)
-        assert same_symbol_structure(res_converted, res_expected), \
-            "convert_symbol generating wrong computation graph when conditional ops is used"
-
-        # Test for op in conditional ops with condition satisfied
-        res_converted = amp.convert_symbol(fc_cond, target_dtype="float16", target_dtype_ops=[],
-                                           fp32_ops=["sin"],
-                                           conditional_fp32_ops=[("FullyConnected", "no_bias", ["True"])])
-        x_fp32 = mx.sym.amp_cast(x, dtype="float32")
-        y_fp32 = mx.sym.amp_cast(y, dtype="float32")
-        res_expected = mx.sym.FullyConnected(x_fp32, y_fp32, num_hidden=10, no_bias=True)
-        assert same_symbol_structure(res_converted, res_expected), \
-            "convert_symbol generating wrong computation graph when conditional ops used with satisfying condition"
-
-        # Test with a real world model, default inputs for convert_symbol
-        dir_path = os.path.dirname(os.path.realpath(__file__))
-        model_path = os.path.join(dir_path, 'model')
-        if not os.path.isdir(model_path):
-            os.mkdir(model_path)
-
-        prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path)
-        sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
-        inputs = {}
-        inputs['data'] = mx.nd.ones((1, 3, 224, 224))
-        inputs.update(arg_params)
-        converted_sym = amp.convert_symbol(sym)
-        exe = converted_sym.simple_bind(mx.gpu(0), data=(1, 3, 224, 224), grad_req='null')
-        exe.forward(is_train=False, **inputs)
-        exe.outputs[0].asnumpy()
-
-        inputs2 = {}
-        inputs2['data'] = mx.nd.ones((1, 3, 224, 224))
-        inputs2['fc1_weight'] = inputs['fc1_weight'].astype(np.float16)
-        inputs2['fc1_bias'] = inputs['fc1_bias'].astype(np.float16)
-
-        # Test with a real world model, tweak inputs for convert_symbol
-        converted_sym = amp.convert_symbol(sym, target_dtype="float16",
-                                           target_dtype_ops=["Convolution"], data_names=["data"],
-                                           cast_optional_params=True)
-        converted_sym2 = amp.convert_symbol(sym, target_dtype="float16",
-                                            target_dtype_ops=["Convolution"], data_names=["data"],
-                                            cast_optional_params=False)
-
-        exe = converted_sym.simple_bind(mx.gpu(0), data=(1, 3, 224, 224), grad_req='null')
-        exe2 = converted_sym2.simple_bind(mx.gpu(), data=(1, 3, 224, 224), grad_req='null')
-
-        converted_args = converted_sym.list_arguments()
-        converted_auxs = converted_sym.list_auxiliary_states()
-        for i, key in enumerate(exe.arg_arrays):
-            if converted_args[i] in arg_params:
-                arg_params[converted_args[i]] = arg_params[converted_args[i]].astype(exe.arg_arrays[i].dtype)
-        for i, key in enumerate(exe.aux_arrays):
-            if converted_auxs[i] in aux_params:
-                aux_params[converted_auxs[i]] = aux_params[converted_auxs[i]].astype(exe.aux_arrays[i].dtype)
-
-        inputs2.update(arg_params)
-        exe.forward(is_train=False, **inputs2)
-        exe.outputs[0].wait_to_read()
-
-        inputs['fc1_weight'] = inputs['fc1_weight'].astype(np.float16)
-        inputs['fc1_bias'] = inputs['fc1_bias'].astype(np.float16)
-        exe2.forward(is_train=False, **inputs)
-        exe2.outputs[0].wait_to_read()
-
-
-    def check_amp_convert_hybrid_block():
-        # Test conversion for hybrid block on CPU
-        model_cpu = get_model("resnet50_v1")
-        model_cpu.collect_params().initialize(ctx=mx.cpu())
-        model_cpu.hybridize()
-        model_cpu(mx.nd.random.uniform(0, 1, shape=(1, 3, 224, 224), ctx=mx.cpu()))
-        converted_model_cpu = amp.convert_hybrid_block(model_cpu)
-
-        # Test with real world model, default inputs for convert_hybrid_block
-        model = get_model("resnet50_v1")
-        model.collect_params().initialize(ctx=mx.gpu())
-        model.hybridize()
-        model(mx.nd.zeros((1, 3, 224, 224)))
-        converted_model = amp.convert_hybrid_block(model)
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224),
-                                                     dtype=np.float32))
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224),
-                                                     dtype=np.float32))
-
-        # Test with real world model, tweak inputs for convert_hybrid_block
-        converted_model = amp.convert_hybrid_block(model, target_dtype="float16",
-                                                   target_dtype_ops=["Convolution"])
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224),
-                                                      dtype=np.float32))
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224),
-                                                     dtype=np.float32))
-
-        # Check symbolic block
-        dir_path = os.path.dirname(os.path.realpath(__file__))
-        model_path = os.path.join(dir_path, 'model')
-        if not os.path.isdir(model_path):
-            os.mkdir(model_path)
-        prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path)
-        net = SymbolBlock.imports(os.path.join(model_path, "imagenet1k-resnet-18-symbol.json"),
-                                  input_names=["data", "softmax_label"],
-                                  param_file=os.path.join(model_path, "imagenet1k-resnet-18-0000.params"))
-        net.collect_params().reset_ctx(ctx=mx.gpu())
-        net.hybridize()
-        net(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1,)))
-        converted_model = amp.convert_hybrid_block(net)
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1,)))
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1,)))
-
-        # Check symbolic block, tweaked inputs
-        converted_model = amp.convert_hybrid_block(net, target_dtype="float16", target_dtype_ops=["Convolution"])
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, )))
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, )))
-        params = converted_model.collect_params()
-        assert params["stage2_unit1_conv2_weight"].dtype == np.float32
-
-        # Pass cast_optional_params as True to convert_hybrid_block
-        converted_model = amp.convert_hybrid_block(net, target_dtype="float16", target_dtype_ops=["Convolution"],
-                                                   cast_optional_params=True)
-        params = converted_model.collect_params()
-        assert params["stage2_unit1_conv2_weight"].dtype == np.float16
-
-
-    with mx.Context(mx.gpu(0)):
-        check_amp_convert_symbol()
-        check_amp_convert_hybrid_block()
-
 @with_seed()
 @pytest.mark.skip(reason='Error during waitall(). Tracked in #18099')
 @assert_raises_cudnn_not_satisfied(min_version='5.1.10')
diff --git a/tests/python/gpu/test_forward.py b/tests/python/gpu/test_forward.py
deleted file mode 100644
index 2ec5ee262e0c..000000000000
--- a/tests/python/gpu/test_forward.py
+++ /dev/null
@@ -1,82 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import sys
-import os
-import numpy as np
-import mxnet as mx
-from mxnet.test_utils import *
-curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-sys.path.insert(0, os.path.join(curr_path, '../unittest'))
-from common import setup_module, with_seed, teardown_module
-from mxnet.gluon import utils
-import tarfile
-
-def _get_model():
-    if not os.path.exists('model/Inception-7-symbol.json'):
-        download('http://data.mxnet.io/models/imagenet/inception-v3.tar.gz')
-        with tarfile.open(name="inception-v3.tar.gz", mode="r:gz") as tf:
-            tf.extractall()
-
-def _dump_images(shape):
-    import skimage.io
-    import skimage.transform
-    img_list = []
-    for img in sorted(os.listdir('data/test_images/')):
-        img = skimage.io.imread('data/test_images/'+img)
-        short_egde = min(img.shape[:2])
-        yy = int((img.shape[0] - short_egde) / 2)
-        xx = int((img.shape[1] - short_egde) / 2)
-        img = img[yy : yy + short_egde, xx : xx + short_egde]
-        img = skimage.transform.resize(img, shape)
-        img_list.append(img)
-    imgs = np.asarray(img_list, dtype=np.float32).transpose((0, 3, 1, 2)) - 128
-    np.save('data/test_images_%d_%d.npy'%shape, imgs)
-
-def _get_data(shape):
-    hash_test_img = "355e15800642286e7fe607d87c38aeeab085b0cc"
-    hash_inception_v3 = "91807dfdbd336eb3b265dd62c2408882462752b9"
-    utils.download("http://data.mxnet.io/data/test_images_%d_%d.npy" % (shape),
-                   path="data/test_images_%d_%d.npy" % (shape),
-                   sha1_hash=hash_test_img)
-    utils.download("http://data.mxnet.io/data/inception-v3-dump.npz",
-                   path='data/inception-v3-dump.npz',
-                   sha1_hash=hash_inception_v3)
-
-@with_seed()
-def test_consistency(dump=False):
-    shape = (299, 299)
-    _get_model()
-    _get_data(shape)
-    if dump:
-        _dump_images(shape)
-        gt = None
-    else:
-        gt = {n: mx.nd.array(a) for n, a in np.load('data/inception-v3-dump.npz').items()}
-    data = np.load('data/test_images_%d_%d.npy'%shape)
-    sym, arg_params, aux_params = mx.model.load_checkpoint('model/Inception-7', 1)
-    arg_params['data'] = data
-    arg_params['softmax_label'] = np.random.randint(low=1, high=1000, size=(data.shape[0],))
-    ctx_list = [{'ctx': mx.gpu(0), 'data': data.shape, 'type_dict': {'data': data.dtype}},
-                {'ctx': mx.cpu(0), 'data': data.shape, 'type_dict': {'data': data.dtype}}]
-    gt = check_consistency(sym, ctx_list, arg_params=arg_params, aux_params=aux_params,
-                           tol=1e-3, grad_req='null', raise_on_err=False, ground_truth=gt)
-    if dump:
-        np.savez('data/inception-v3-dump.npz', **{n: a.asnumpy() for n, a in gt.items()})
-
-if __name__ == '__main__':
-    test_consistency(False)
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 6e43559697b0..a1adda5641b1 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -47,8 +47,6 @@
 from test_contrib_optimizer import test_adamw
 
 set_default_context(mx.gpu(0))
-del test_support_vector_machine_l1_svm  # noqa
-del test_support_vector_machine_l2_svm  # noqa
 del test_custom_op_fork  #noqa
 
 def check_countsketch(in_dim,out_dim,n):
@@ -1659,18 +1657,6 @@ def test_embedding_helper(data_types, weight_types, low_pad, high_pad):
     test_embedding_helper(data_types, weight_types, 0, 5)
 
 
-@with_seed()
-def test_svmoutput_with_type():
-    sym = mx.sym.SVMOutput(name='svmoutput', use_linear=True)
-    ctx_list = [{'ctx': mx.gpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float64}},
-                {'ctx': mx.gpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float32}},
-                {'ctx': mx.gpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float16}},
-                {'ctx': mx.cpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float64}},
-                {'ctx': mx.cpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float32}},
-                {'ctx': mx.cpu(0), 'svmoutput_data': (20, 10), 'type_dict': {'svmoutput_data': np.float16}}]
-    check_consistency(sym, ctx_list, use_uniform=True)
-
-
 @with_seed()
 def test_take_with_type():
     sym = mx.sym.take(name='take')
diff --git a/tests/python/gpu/test_predictor.py b/tests/python/gpu/test_predictor.py
index b1a4d2ef1df6..7992f59f6210 100644
--- a/tests/python/gpu/test_predictor.py
+++ b/tests/python/gpu/test_predictor.py
@@ -27,7 +27,7 @@
 import mxnet.ndarray as nd
 from mxnet.ndarray import NDArray
 from mxnet import gluon
-from mxnet.test_utils import assert_almost_equal, download_model
+from mxnet.test_utils import assert_almost_equal
 from mxnet.contrib.amp import amp
 from mxnet.base import NDArrayHandle, py_str
 sys.path.insert(0, os.path.join(curr_path, '../unittest'))
diff --git a/tests/python/mkl/test_bf16_operator.py b/tests/python/mkl/test_bf16_operator.py
index 4b1f75ec7dc5..a67bfb548796 100644
--- a/tests/python/mkl/test_bf16_operator.py
+++ b/tests/python/mkl/test_bf16_operator.py
@@ -25,7 +25,7 @@
 import ctypes
 import itertools
 import mxnet.contrib.amp as amp
-from mxnet.test_utils import set_default_context, download_model, same_symbol_structure, assert_almost_equal_with_err, rand_shape_nd
+from mxnet.test_utils import set_default_context, same_symbol_structure, assert_almost_equal_with_err, rand_shape_nd
 from mxnet.gluon.model_zoo.vision import get_model
 from mxnet.gluon import SymbolBlock, nn, rnn
 from mxnet.contrib.amp import amp
diff --git a/tests/python/mkl/test_contrib_amp.py b/tests/python/mkl/test_contrib_amp.py
index e63424c46a80..cd85ce6f0bb8 100644
--- a/tests/python/mkl/test_contrib_amp.py
+++ b/tests/python/mkl/test_contrib_amp.py
@@ -25,7 +25,7 @@
 import ctypes
 import mxnet.contrib.amp as amp
 import pytest
-from mxnet.test_utils import set_default_context, download_model, same_symbol_structure, assert_almost_equal
+from mxnet.test_utils import set_default_context, same_symbol_structure, assert_almost_equal
 from mxnet.gluon.model_zoo.vision import get_model
 from mxnet.gluon import SymbolBlock, nn, rnn
 from mxnet.contrib.amp import amp
@@ -95,181 +95,6 @@ def test_amp_coverage():
                        - If you are not sure which list to choose, FP32_FUNCS is the
                          safest option""")
 
-@with_seed()
-def test_amp_conversion():
-    def check_amp_convert_symbol():
-        x = mx.sym.var("x")
-        y = mx.sym.var("y")
-        z = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True)
-        siny = mx.sym.sin(y)
-        res = z + siny
-        # Compare symbols with similar computation graphs created using convert_symbol and manually.
-        res_converted = amp.convert_symbol(res, target_dtype="bfloat16",
-                                           target_dtype_ops=["FullyConnected"],
-                                           fp32_ops=["sin"])
-        x_bf16 = mx.sym.amp_cast(x, dtype=bfloat16)
-        y_bf16 = mx.sym.amp_cast(y, dtype=bfloat16)
-        siny = mx.sym.sin(y)
-        z = mx.sym.FullyConnected(x_bf16, y_bf16, num_hidden=10, no_bias=True)
-        amp_casted_z = mx.sym.amp_cast(z, dtype="float32")
-        res_expected = amp_casted_z + siny
-        assert same_symbol_structure(res_converted, res_expected), \
-            "convert_symbol generating wrong computation graph"
-
-        # convert_symbol called with incorrect inputs
-        pytest.raises(AssertionError, amp.convert_symbol, res,
-                      target_dtype="bfloat16", target_dtype_ops=["FullyConnected"],
-                      fp32_ops=["elemwise_add"])
-        pytest.raises(AssertionError, amp.convert_symbol, res,
-                      target_dtype="bfloat16", target_dtype_ops=["FullyConnected"],
-                      fp32_ops=["Activation"],
-                      conditional_fp32_ops=[('Activation', 'act_type', ['selu'])])
-        pytest.raises(AssertionError, amp.convert_symbol, res,
-                      target_dtype="bfloat16", target_dtype_ops=["Activation"],
-                      fp32_ops=["Activation"],
-                      conditional_fp32_ops=[('Activation', 'act_type', ['selu'])])
-        pytest.raises(AssertionError, amp.convert_symbol, res,
-                      target_dtype="bfloat16", target_dtype_ops=["FullyConnected"],
-                      fp32_ops=["FullyConnected"])
-
-        # Test for op in conditional ops with condition not satisfied
-        x = mx.sym.var("x")
-        y = mx.sym.var("y")
-        fc_cond = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True)
-        res_converted = amp.convert_symbol(fc_cond, target_dtype="bfloat16",
-                                           target_dtype_ops=[],
-                                           fp32_ops=["sin"],
-                                           conditional_fp32_ops=[("FullyConnected", "no_bias", ["False"])])
-
-        res_expected = mx.sym.FullyConnected(x, y, num_hidden=10, no_bias=True)
-        assert same_symbol_structure(res_converted, res_expected), \
-           "convert_symbol generating wrong computation graph when conditional ops is used"
-
-        # Test for op in conditional ops with condition satisfied
-        res_converted = amp.convert_symbol(fc_cond, target_dtype="bfloat16", target_dtype_ops=[],
-                                           fp32_ops=["sin"],
-                                           conditional_fp32_ops=[("FullyConnected", "no_bias", ["True"])])
-        x_fp32 = mx.sym.amp_cast(x, dtype="float32")
-        y_fp32 = mx.sym.amp_cast(y, dtype="float32")
-        res_expected = mx.sym.FullyConnected(x_fp32, y_fp32, num_hidden=10, no_bias=True)
-        assert same_symbol_structure(res_converted, res_expected), \
-           "convert_symbol generating wrong computation graph when conditional ops used with satisfying condition"
-
-        # Test with a real world model, default inputs for convert_symbol
-        dir_path = os.path.dirname(os.path.realpath(__file__))
-        model_path = os.path.join(dir_path, 'model')
-        if not os.path.isdir(model_path):
-            os.mkdir(model_path)
-
-        prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path)
-        sym, arg_params, aux_params = mx.model.load_checkpoint(prefix, epoch)
-        inputs = {}
-        inputs['data'] = mx.nd.ones((1, 3, 224, 224))
-        inputs.update(arg_params)
-        converted_sym = amp.convert_symbol(sym, target_dtype="bfloat16")
-        exe = converted_sym.simple_bind(mx.cpu(), data=(1, 3, 224, 224), grad_req='null')
-        exe.forward(is_train=False, **inputs)
-        exe.outputs[0].asnumpy()
-
-        inputs_bf16 = {}
-        inputs_bf16['data'] = mx.nd.ones((1, 3, 224, 224))
-        inputs_bf16['fc1_weight'] = mx.nd.amp_cast(inputs['fc1_weight'], dtype=bfloat16)
-        inputs_bf16['fc1_bias'] = mx.nd.amp_cast(inputs['fc1_bias'], dtype=bfloat16)
-
-        # Test with a real world model, tweak inputs for convert_symbol
-        converted_sym = amp.convert_symbol(sym, target_dtype="bfloat16",
-                                           target_dtype_ops=["Convolution"], data_names=["data"],
-                                           cast_optional_params=True)
-        converted_sym2 = amp.convert_symbol(sym, target_dtype="bfloat16",
-                                            target_dtype_ops=["Convolution"], data_names=["data"],
-                                            cast_optional_params=False)
-
-        exe = converted_sym.simple_bind(mx.cpu(), data=(1, 3, 224, 224), grad_req='null')
-        exe2 = converted_sym2.simple_bind(mx.cpu(), data=(1, 3, 224, 224), grad_req='null')
-
-        converted_args = converted_sym.list_arguments()
-        converted_auxs = converted_sym.list_auxiliary_states()
-        for i, key in enumerate(exe.arg_arrays):
-            if converted_args[i] in arg_params:
-                arg_dtype = exe.arg_arrays[i].dtype
-                if arg_dtype == bfloat16:
-                    arg_params[converted_args[i]] = mx.nd.amp_cast(arg_params[converted_args[i]], dtype=bfloat16)
-                else:
-                    arg_params[converted_args[i]] = arg_params[converted_args[i]].astype(arg_dtype)
-        for i, key in enumerate(exe.aux_arrays):
-            aux_dtype = exe.aux_arrays[i].dtype
-            if converted_auxs[i] in aux_params:
-                if arg_dtype == bfloat16:
-                    aux_params[converted_auxs[i]] = mx.nd.amp_cast(aux_params[converted_auxs[i]], dtype=bfloat16)
-                else:
-                    aux_params[converted_auxs[i]] = aux_params[converted_auxs[i]].astype(aux_dtype)
-
-        inputs_bf16.update(arg_params)
-        exe.forward(is_train=False, **inputs_bf16)
-        exe.outputs[0].wait_to_read()
-
-        exe2.forward(is_train=False, **inputs)
-        exe2.outputs[0].wait_to_read()
-
-    def check_amp_convert_hybrid_block():
-        # Test conversion for hybrid block on CPU
-        model_cpu = get_model("resnet50_v1")
-        model_cpu.collect_params().initialize(ctx=mx.cpu())
-        model_cpu.hybridize()
-        model_cpu(mx.nd.random.uniform(0, 1, shape=(1, 3, 224, 224), ctx=mx.cpu()))
-        converted_model_cpu = amp.convert_hybrid_block(model_cpu, target_dtype="bfloat16", ctx=mx.cpu())
-
-        # Test with real world model, default inputs for convert_hybrid_block
-        model = get_model("resnet50_v1")
-        model.collect_params().initialize(ctx=mx.cpu())
-        model.hybridize()
-        model(mx.nd.zeros((1, 3, 224, 224)))
-        converted_model = amp.convert_hybrid_block(model, target_dtype="bfloat16", ctx=mx.cpu())
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224),
-                                                     dtype=np.float32))
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224),
-                                                     dtype=np.float32))
-
-        # Test with real world model, tweak inputs for convert_hybrid_block
-        converted_model = amp.convert_hybrid_block(model, target_dtype="bfloat16",
-                                                   target_dtype_ops=["Convolution"], ctx=mx.cpu())
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224),
-                                                      dtype=np.float32))
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224),
-                                                     dtype=np.float32))
-
-        # Check symbolic block
-        dir_path = os.path.dirname(os.path.realpath(__file__))
-        model_path = os.path.join(dir_path, 'model')
-        if not os.path.isdir(model_path):
-            os.mkdir(model_path)
-        prefix, epoch = download_model("imagenet1k-resnet-18", dst_dir=model_path)
-        net = SymbolBlock.imports(os.path.join(model_path, "imagenet1k-resnet-18-symbol.json"),
-                                  input_names=["data", "softmax_label"],
-                                  param_file=os.path.join(model_path, "imagenet1k-resnet-18-0000.params"))
-        net.collect_params().reset_ctx(ctx=mx.cpu())
-        net.hybridize()
-        net(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1,)))
-        converted_model = amp.convert_hybrid_block(net, target_dtype="bfloat16", ctx=mx.cpu())
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1,)))
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1,)))
-
-        # Check symbolic block, tweaked inputs
-        converted_model = amp.convert_hybrid_block(net, target_dtype="bfloat16", target_dtype_ops=["Convolution"], ctx=mx.cpu())
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, )))
-        result = converted_model.forward(mx.nd.zeros((1, 3, 224, 224)), mx.nd.zeros((1, )))
-        params = converted_model.collect_params()
-        assert params["stage2_unit1_conv2_weight"].dtype == np.float32
-
-        # Pass cast_optional_params as True to convert_hybrid_block
-        converted_model = amp.convert_hybrid_block(net, target_dtype="bfloat16", target_dtype_ops=["Convolution"],
-                                                   cast_optional_params=True, ctx=mx.cpu())
-        params = converted_model.collect_params()
-        assert params["stage2_unit1_conv2_weight"].dtype == bfloat16
-
-    check_amp_convert_symbol()
-    check_amp_convert_hybrid_block()
-
 @with_seed()
 def test_bf16_casting():
     data = mx.sym.var("data")
diff --git a/tests/python/mkl/test_mkldnn.py b/tests/python/mkl/test_mkldnn.py
index 6b075dec463c..26ec0818cf4b 100644
--- a/tests/python/mkl/test_mkldnn.py
+++ b/tests/python/mkl/test_mkldnn.py
@@ -31,35 +31,6 @@
 sys.path.append(os.path.join(curr_path, '../unittest/'))
 from common import with_seed
 
-
-def test_mkldnn_model():
-    model = os.path.join(os.path.dirname(os.path.realpath(__file__)), "data",
-                         "test_mkldnn_test_mkldnn_model_model1.json")
-    shape = (32, 3, 300, 300)
-    ctx = mx.cpu()
-
-    sym = mx.sym.load(model)
-    args = sym.list_arguments()
-    shapes = sym.infer_shape(data=shape)
-
-    def get_tensors(args, shapes, ctx):
-        return {x: mx.nd.ones(y, ctx) for x, y in zip(args, shapes)}
-
-    inputs = get_tensors(args, shapes[0], ctx)
-    grads = get_tensors(args, shapes[0], ctx)
-
-    try:
-        exe = sym.bind(ctx, inputs, args_grad=grads)
-        for _ in range(2):
-            exe.forward(is_train=True)
-            for y in exe.outputs:
-                y.wait_to_read()
-            exe.backward()
-            for y in exe.grad_arrays:
-                y.wait_to_read()
-    except:  # pylint: disable=bare-except
-        assert 0, "test_mkldnn_model exception in bind and execution"
-
 @with_seed(1234)
 def test_mkldnn_ndarray_slice():
     ctx = mx.cpu()
diff --git a/tests/python/mkl/test_subgraph.py b/tests/python/mkl/test_subgraph.py
index 9e5d4776c30e..d884e911bd3e 100644
--- a/tests/python/mkl/test_subgraph.py
+++ b/tests/python/mkl/test_subgraph.py
@@ -178,8 +178,6 @@ def conv_act_sum(no_bias, data_shape, alg):
   return sum, attr
 
 
-
-
 # mobilenetv2 case
 def mobilenetv2_struct(data_shape):
   attr = {'sg_mkldnn_conv_bn_0' : {'with_bn': 'true'}}
@@ -195,184 +193,6 @@ def mobilenetv2_struct(data_shape):
   sum = bn1 + bn2
   return sum, attr
 
-def tail_neg_symbol(sym1, sym2):
-  fc1 = mx.sym.FullyConnected(data=sym1, num_hidden=10, flatten=True, name='fc1')
-  fc2 = mx.sym.FullyConnected(data=sym2, num_hidden=10, flatten=True, name='fc2')
-  concat = mx.sym.Concat(*[fc1, fc2], name="concat")
-  sym = mx.sym.SoftmaxOutput(data=concat, name='softmax')
-  return sym
-
-# conv + bn can't be fusion case
-# eg.1
-# conv --------- > bn
-#  |
-#  |
-#  -------------> [custom op]
-def neg_conv_bn(data_shape):
-  syms = []
-  attrs = []
-  excluded_attrs = []
-  data, weight = head_symbol(data_shape)
-
-  # eg.1 ([custom op] = pool)
-  conv = mx.symbol.Convolution(data=data, weight=weight, name='conv', num_filter=64, kernel=(3, 3), stride=(1, 1))
-  bn1 = mx.symbol.BatchNorm(data=conv, name="bn1")
-  pool = mx.sym.Pooling(data=conv, kernel=(4, 4), pool_type='avg', name='pool')
-  sym = tail_neg_symbol(bn1, pool)
-
-  syms.append(sym)
-  attrs.append([])
-  excluded_attrs.append([])
-  return syms, attrs, excluded_attrs
-
-# conv + relu can't be fusion case
-# eg.1
-# conv -----------> relu
-#  |
-#  |
-#  ---------------> [custom op]
-def neg_conv_relu(data_shape):
-  syms = []
-  attrs = []
-  excluded_attrs = []
-  data, weight = head_symbol(data_shape)
-
-  # eg.1 ([custom op] = pool)
-  conv = mx.symbol.Convolution(data=data, weight=weight, name='conv', num_filter=64, kernel=(3, 3), stride=(1, 1))
-  relu = mx.symbol.Activation(data=conv, name='relu', act_type="relu")
-  pool = mx.sym.Pooling(data=conv, kernel=(4, 4), pool_type='avg', name='pool')
-  sym = tail_neg_symbol(relu, pool)
-
-  syms.append(sym)
-  attrs.append([])
-  excluded_attrs.append([])
-  return syms, attrs, excluded_attrs
-
-# conv + add can't be fusion case
-# eg.1
-#  ---------------> [custom op]
-#  |
-#  |
-# conv -----------> add
-#                   |
-#                   |
-# added ------------>
-def neg_conv_add(data_shape):
-  syms = []
-  attrs = []
-  excluded_attrs = []
-  val = mx.symbol.Variable('addval')
-  data, weight = head_symbol(data_shape)
-
-  # eg.1 ([custom op] = pool, [added op] = val)
-  conv = mx.symbol.Convolution(data=data, weight=weight, name='conv', num_filter=64, kernel=(3, 3), stride=(1, 1))
-  sum1 = conv + val
-  pool = mx.sym.Pooling(data=conv, kernel=(4, 4), pool_type='avg', name='pool')
-  sym = tail_neg_symbol(sum1, pool)
-
-  syms.append(sym)
-  attrs.append([])
-  excluded_attrs.append('with_sum')
-  return syms, attrs, excluded_attrs
-
-# conv + bn + relu can't be fusion case
-# eg.1
-#   --------------> [custom op]
-#   |
-# conv -----------> bn -----------> relu
-#
-# eg.2
-#                   --------------> [custom op]
-#                   |
-# conv -----------> bn -----------> relu
-def neg_conv_bn_relu(data_shape):
-  syms = []
-  attrs = []
-  excluded_attrs = []
-  data, weight = head_symbol(data_shape)
-
-  # eg.1 ([custom op] = pool11)
-  conv11 = mx.symbol.Convolution(data=data, weight=weight, name='conv11', num_filter=64, kernel=(3, 3), stride=(1, 1))
-  bn11 = mx.symbol.BatchNorm(data=conv11, name="bn11")
-  relu11 = mx.symbol.Activation(data=bn11, name='relu11', act_type="relu")
-  pool11 = mx.sym.Pooling(data=conv11, kernel=(4, 4), pool_type='avg', name='pool11')
-  sym1 = tail_neg_symbol(relu11, pool11)
-
-  syms.append(sym1)
-  attrs.append([])
-  excluded_attrs.append([])
-
-  # eg.2 ([custom op] = pool)
-  conv21 = mx.symbol.Convolution(data=data, weight=weight, name='conv21', num_filter=64, kernel=(3, 3), stride=(1, 1))
-  bn21 = mx.symbol.BatchNorm(data=conv21, name="bn21")
-  relu21 = mx.symbol.Activation(data=bn21, name='relu21', act_type="relu")
-  pool21 = mx.sym.Pooling(data=bn21, kernel=(4, 4), pool_type='avg', name='pool21')
-  sym2 = tail_neg_symbol(relu21, pool21)
-
-  syms.append(sym2)
-  attrs.append(['with_bn'])
-  excluded_attrs.append(['with_act'])
-  return syms, attrs, excluded_attrs
-
-# conv + bn + add + relu can't be fusion case
-# eg.1
-#   --------------> [custom op]
-#   |
-# conv -----------> bn -----------> add -----------> relu
-#
-# eg.2
-#                    -------------> [custom op]
-#                    |
-# conv -----------> bn -----------> add -----------> relu
-#
-# eg.3
-#                                    --------------> [custom op]
-#                                    |
-# conv -----------> bn -----------> add -----------> relu
-def neg_conv_bn_add_relu(data_shape):
-  syms = []
-  attrs = []
-  excluded_attrs = []
-  addVal = mx.symbol.Variable('addval')
-  data, weight = head_symbol(data_shape)
-
-  # eg.1
-  conv11 = mx.symbol.Convolution(data=data, weight=weight, name='conv11', num_filter=64, kernel=(3, 3), stride=(1, 1))
-  bn11 = mx.symbol.BatchNorm(data=conv11, name="bn11")
-  sum11 = bn11 + addVal
-  relu11 = mx.symbol.Activation(data=sum11, name='relu11', act_type="relu")
-  pool11 = mx.sym.Pooling(data=conv11, kernel=(4, 4), pool_type='avg', name='pool11')
-  sym1 = tail_neg_symbol(relu11, pool11)
-
-  syms.append(sym1)
-  attrs.append([])
-  excluded_attrs.append(['with_sum', 'with_postsum_act', 'with_bn'])
-
-  # eg.2
-  conv21 = mx.symbol.Convolution(data=data, weight=weight, name='conv21', num_filter=64, kernel=(3, 3), stride=(1, 1))
-  bn21 = mx.symbol.BatchNorm(data=conv21, name="bn21")
-  sum21 = bn21 + addVal
-  relu21 = mx.symbol.Activation(data=sum21, name='relu21', act_type="relu")
-  pool21 = mx.sym.Pooling(data=bn21, kernel=(4, 4), pool_type='avg', name='pool21')
-  sym2 = tail_neg_symbol(relu21, pool21)
-
-  syms.append(sym2)
-  attrs.append(['with_bn'])
-  excluded_attrs.append(['with_sum', 'with_postsum_act'])
-
-  # eg.3
-  conv31 = mx.symbol.Convolution(data=data, weight=weight, name='conv31', num_filter=64, kernel=(3, 3), stride=(1, 1))
-  bn31 = mx.symbol.BatchNorm(data=conv31, name="bn31")
-  sum31 = bn31 + addVal
-  relu31 = mx.symbol.Activation(data=sum31, name='relu31', act_type="relu")
-  pool31 = mx.sym.Pooling(data=sum31, kernel=(4, 4), pool_type='avg', name='pool31')
-  sym3 = tail_neg_symbol(relu31, pool31)
-
-  syms.append(sym3)
-  attrs.append(['with_bn', 'with_sum'])
-  excluded_attrs.append(['with_postsum_act'])
-  return syms, attrs, excluded_attrs
-
 def single_fc(no_bias, data_shape, flatten=True):
   attr = {'fc': {}}
   data, weight = head_symbol(data_shape)
@@ -403,68 +223,6 @@ def fc_eltwise(no_bias, data_shape, flatten=True, alg='relu'):
 
   return sym, attr
 
-# fc + relu can't be fusion case
-# eg.1
-# fc -----------> relu
-#  |
-#  |
-#  ---------------> [custom op]
-def neg_fc_relu(no_bias, data_shape, flatten=True):
-  syms = []
-  attrs = []
-  excluded_attrs = []
-  data, weight = head_symbol(data_shape)
-
-  # eg.1 ([custom op] = pool)
-  fc = mx.symbol.FullyConnected(name='fc', data=data, weight=weight, num_hidden=64,
-                                no_bias=no_bias, flatten=flatten)
-  relu = mx.symbol.Activation(data=fc, name='relu', act_type="relu")
-  sigmoid = mx.symbol.Activation(data=fc, name='sigmoid', act_type="sigmoid")
-  sym = tail_neg_symbol(relu, sigmoid)
-
-  syms.append(sym)
-  attrs.append([])
-  excluded_attrs.append([])
-  return syms, attrs, excluded_attrs
-
-@with_seed()
-@pytest.mark.parametrize('data_shape', DATA_SHAPE)
-def test_neg_conv_bn(data_shape):
-    syms, attrs, excluded_attrs = neg_conv_bn(data_shape)
-    check_neg_fusion(syms, attrs, excluded_attrs, data_shape)
-
-@with_seed()
-@pytest.mark.parametrize('data_shape', DATA_SHAPE)
-def test_neg_conv_relu(data_shape):
-    syms, attrs, excluded_attrs = neg_conv_relu(data_shape)
-    check_neg_fusion(syms, attrs, excluded_attrs, data_shape)
-
-@with_seed()
-@pytest.mark.parametrize('data_shape', DATA_SHAPE)
-def test_neg_conv_add(data_shape):
-    syms, attrs, excluded_attrs = neg_conv_add(data_shape)
-    check_neg_fusion(syms, attrs, excluded_attrs, data_shape)
-
-@with_seed()
-@pytest.mark.parametrize('data_shape', DATA_SHAPE)
-def test_neg_conv_bn_relu(data_shape):
-    syms, attrs, excluded_attrs = neg_conv_bn_relu(data_shape)
-    check_neg_fusion(syms, attrs, excluded_attrs, data_shape)
-
-@with_seed()
-@pytest.mark.parametrize('data_shape', DATA_SHAPE)
-def test_neg_conv_bn_add_relu(data_shape):
-    syms, attrs, excluded_attrs = neg_conv_bn_add_relu(data_shape)
-    check_neg_fusion(syms, attrs, excluded_attrs, data_shape)
-
-
-@with_seed()
-@pytest.mark.parametrize('data_shape', DATA_SHAPE)
-@pytest.mark.parametrize('no_bias', [True, False])
-@pytest.mark.parametrize('flatten', [True, False])
-def test_neg_fc_relu(data_shape, no_bias, flatten):
-    syms, attrs, excluded_attrs = neg_fc_relu(no_bias, data_shape, flatten)
-    check_neg_fusion(syms, attrs, excluded_attrs, data_shape, name='fc')
 
 def test_float64_fallback():
     sym = mx.sym.FullyConnected(
diff --git a/tests/python/quantization/test_quantization.py b/tests/python/quantization/test_quantization.py
index 317e8cc65d86..7e01734e61a8 100644
--- a/tests/python/quantization/test_quantization.py
+++ b/tests/python/quantization/test_quantization.py
@@ -743,78 +743,6 @@ def test_quantize_params():
             assert name not in param_names
             assert name.find('quantize') != -1
 
-
-def get_fp32_sym():
-    data = mx.sym.Variable('data')
-    conv = mx.sym.Convolution(data, kernel=(1, 1), num_filter=16, name='conv')
-    bn = mx.sym.BatchNorm(data=conv, eps=2e-05, fix_gamma=False, momentum=0.9, use_global_stats=False, name='bn')
-    act = mx.sym.Activation(data=bn, act_type='relu', name='relu')
-    pool = mx.sym.Pooling(act, kernel=(4, 4), pool_type='avg', name='pool')
-    fc = mx.sym.FullyConnected(pool, num_hidden=10, flatten=True, name='fc')
-    sym = mx.sym.SoftmaxOutput(fc, grad_scale=1, ignore_label=-1, multi_output=False,
-                               out_grad=False, preserve_shape=False, use_ignore=False, name='softmax')
-    return sym
-
-def get_fp32_residual():
-    data = mx.sym.Variable('data')
-    conv0 = mx.sym.Convolution(data=data, num_filter=4, kernel=(1,1), pad=(0,0),
-                               no_bias=True, name='conv0')
-    bn = mx.sym.BatchNorm(data=conv0, fix_gamma=False, eps=2e-5, momentum=0.9, name='bn')
-    sum0 = mx.sym.elemwise_add(bn, data, name='sum0')
-    act0 = mx.sym.Activation(data=sum0, act_type='relu', name='relu0')
-    pool0 = mx.sym.Pooling(act0, kernel=(4, 4), pool_type='avg', name='pool0')
-    conv1 = mx.sym.Convolution(data=pool0, num_filter=4, kernel=(1,1), pad=(0,0),
-                               no_bias=False, name='conv1')
-    act1 = mx.sym.Activation(data=conv1, act_type='relu', name='relu1')
-    pool1 = mx.sym.Pooling(act1, kernel=(4, 4), pool_type='avg', name='pool1')
-    fc = mx.sym.FullyConnected(pool1, num_hidden=10, flatten=True, name='fc')
-    sym = mx.sym.SoftmaxOutput(fc, grad_scale=1, ignore_label=-1, multi_output=False,
-                               out_grad=False, preserve_shape=False, use_ignore=False, name='softmax')
-    return sym
-
-def get_fp32_sym_with_multiple_outputs(length=1):
-    data = mx.sym.Variable('data')
-    inputs = list(mx.sym.split(data, axis=0, num_outputs=length, squeeze_axis=1, name='split'))
-
-    _conv_outs = []
-    for i in range(length):
-        _conv_outs.append(mx.sym.Convolution(data=inputs[i], kernel=(1, 1), num_filter=16, name='conv_{0}'.format(i)))
-    conv_out = [mx.sym.expand_dims(i, axis=0) for i in _conv_outs]
-    conv_out = mx.sym.Concat(*conv_out, dim=0, name='concat')
-    reshape_out = mx.sym.reshape(data=conv_out, shape=((length, -1)), name='reshape')
-    fc_out = mx.sym.FullyConnected(reshape_out, num_hidden=10, flatten=True, name='fc')
-    sym= mx.sym.SoftmaxOutput(fc_out, grad_scale=1, ignore_label=-1, multi_output=False,
-                              out_grad=False, preserve_shape=False, use_ignore=False, name='softmax')
-    return sym
-
-@xfail_when_nonstandard_decimal_separator
-@with_seed()
-def test_quantize_sym_with_calib():
-    if is_test_for_native_cpu():
-        print('skipped testing quantized_pooling for native cpu since it is not supported yet')
-        return
-
-    sym = get_fp32_sym()
-    offline_params = [name for name in sym.list_arguments()
-                      if not name.startswith('data') and not name.endswith('label')]
-    qsym, _ = mx.contrib.quant._quantize_symbol(sym, ctx=mx.current_context(),
-                                             offline_params=offline_params, quantize_mode='full')
-    requantize_op_names = ['requantize_conv', 'requantize_fc']
-    th_dict = {'conv_output': (np.random.uniform(low=100.0, high=200.0), np.random.uniform(low=100.0, high=200.0)),
-               'fc_output': (np.random.uniform(low=100.0, high=200.0), np.random.uniform(low=100.0, high=200.0))}
-    op_name_to_th_name = {'requantize_conv': 'conv_output', 'requantize_fc': 'fc_output'}
-    cqsym = mx.contrib.quant._calibrate_quantized_sym(qsym, th_dict)
-    attr_dict = cqsym.attr_dict()
-    for name in requantize_op_names:
-        assert name in attr_dict
-        lhs = float(attr_dict[name]['min_calib_range'])
-        rhs = th_dict[op_name_to_th_name[name]][0]
-        assert_almost_equal(np.array([lhs]), np.array([rhs]))
-        lhs = float(attr_dict[name]['max_calib_range'])
-        rhs = th_dict[op_name_to_th_name[name]][1]
-        assert_almost_equal(np.array([lhs]), np.array([rhs]), rtol=1e-3, atol=1e-4)
-
-
 @with_seed()
 def test_smooth_distribution():
     assert_exception(lambda: mx.contrib.quant._smooth_distribution(np.zeros((2,)), eps=1e-3), ValueError)
diff --git a/tests/python/tensorrt/test_ops.py b/tests/python/tensorrt/test_ops.py
index 370f028dbdee..7c50c589c967 100644
--- a/tests/python/tensorrt/test_ops.py
+++ b/tests/python/tensorrt/test_ops.py
@@ -262,22 +262,6 @@ def test_pooling2d():
                 check_single_sym(sym, {'data': data_shape}, rtol_fp32=rtol_fp32,
                                  atol_fp32=atol_fp32, rtol_fp16=rtol_fp16, atol_fp16=atol_fp16)
 
-
-@with_seed()
-def test_softmax_output():
-    data = mx.sym.Variable('data')
-    label = mx.sym.Variable('label')
-    data_shape = (8, 100)
-    label_shape = (8, 100)
-    sym = mx.sym.SoftmaxOutput(data, label)
-    check_single_sym(sym, {'data': data_shape, 'label': label_shape},
-                     rtol_fp32=1e-6, atol_fp32=0., rtol_fp16=5e-3, atol_fp16=0.)
-    sym = mx.sym.SoftmaxOutput(data)
-    check_single_sym(sym, {'data': data_shape},
-                     rtol_fp32=1e-6, atol_fp32=0., rtol_fp16=5e-3, atol_fp16=0.)
-
-
-
 def check_batch_norm(sym, data_shapes, arg_params_shapes=None, aux_params_shapes=None,
                      rtol_fp32=1e-5, atol_fp32=1e-7, rtol_fp16=1e-2, atol_fp16=1e-3):
     if arg_params_shapes is None:
diff --git a/tests/python/unittest/onnx/test_node.py b/tests/python/unittest/onnx/test_node.py
index c4c19a47c15b..4c99417f7fa1 100644
--- a/tests/python/unittest/onnx/test_node.py
+++ b/tests/python/unittest/onnx/test_node.py
@@ -148,10 +148,6 @@ def test_exports(self):
     ("test_square", mx.sym.square, "Pow", [get_rnd((2, 3), dtype=np.int32)], {}, True, {}, True, False),
     ("test_spacetodepth", mx.sym.space_to_depth, "SpaceToDepth", [get_rnd((1, 1, 4, 6))],
      {'block_size': 2}, False, {}, True, False),
-    ("test_softmax", mx.sym.SoftmaxOutput, "Softmax", [get_rnd((1000, 1000)), get_rnd(1000)],
-     {'ignore_label': 0, 'use_ignore': False}, True, {}, True, False),
-    ("test_logistic_regression", mx.sym.LogisticRegressionOutput, "Sigmoid",
-     [get_rnd((1000, 1000)), get_rnd((1000, 1000))], {}, True, {}, True, False),
     ("test_fullyconnected", mx.sym.FullyConnected, "Gemm", [get_rnd((4, 3)), get_rnd((4, 3)), get_rnd(4)],
      {'num_hidden': 4, 'name': 'FC'}, True, {}, True, False),
     ("test_lppool1", mx.sym.Pooling, "LpPool", [get_rnd((2, 3, 20, 20))],
diff --git a/tests/python/unittest/resnet.py b/tests/python/unittest/resnet.py
deleted file mode 100644
index be498602f0b7..000000000000
--- a/tests/python/unittest/resnet.py
+++ /dev/null
@@ -1,196 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-'''
-Adapted from https://github.com/tornadomeet/ResNet/blob/master/symbol_resnet.py
-Original author Wei Wu
-
-Implemented the following paper:
-
-Kaiming He, Xiangyu Zhang, Shaoqing Ren, Jian Sun. "Identity Mappings in Deep Residual Networks"
-'''
-import mxnet as mx
-import numpy as np
-
-def residual_unit(data, num_filter, stride, dim_match, name, bottle_neck=True, bn_mom=0.9, workspace=256, memonger=False):
-    """Return ResNet Unit symbol for building ResNet
-    Parameters
-    ----------
-    data : str
-        Input data
-    num_filter : int
-        Number of output channels
-    bnf : int
-        Bottle neck channels factor with regard to num_filter
-    stride : tuple
-        Stride used in convolution
-    dim_match : Boolean
-        True means channel number between input and output is the same, otherwise means differ
-    name : str
-        Base name of the operators
-    workspace : int
-        Workspace used in convolution operator
-    """
-    if bottle_neck:
-        # the same as https://github.com/facebook/fb.resnet.torch#notes, a bit difference with origin paper
-        bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn1')
-        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
-        conv1 = mx.sym.Convolution(data=act1, num_filter=int(num_filter*0.25), kernel=(1,1), stride=(1,1), pad=(0,0),
-                                   no_bias=True, workspace=workspace, name=name + '_conv1')
-        bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn2')
-        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
-        conv2 = mx.sym.Convolution(data=act2, num_filter=int(num_filter*0.25), kernel=(3,3), stride=stride, pad=(1,1),
-                                   no_bias=True, workspace=workspace, name=name + '_conv2')
-        bn3 = mx.sym.BatchNorm(data=conv2, fix_gamma=False, eps=2e-5, momentum=bn_mom, name=name + '_bn3')
-        act3 = mx.sym.Activation(data=bn3, act_type='relu', name=name + '_relu3')
-        conv3 = mx.sym.Convolution(data=act3, num_filter=num_filter, kernel=(1,1), stride=(1,1), pad=(0,0), no_bias=True,
-                                   workspace=workspace, name=name + '_conv3')
-        if dim_match:
-            shortcut = data
-        else:
-            shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
-                                            workspace=workspace, name=name+'_sc')
-        if memonger:
-            shortcut._set_attr(mirror_stage='True')
-        return conv3 + shortcut
-    else:
-        bn1 = mx.sym.BatchNorm(data=data, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn1')
-        act1 = mx.sym.Activation(data=bn1, act_type='relu', name=name + '_relu1')
-        conv1 = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(3,3), stride=stride, pad=(1,1),
-                                      no_bias=True, workspace=workspace, name=name + '_conv1')
-        bn2 = mx.sym.BatchNorm(data=conv1, fix_gamma=False, momentum=bn_mom, eps=2e-5, name=name + '_bn2')
-        act2 = mx.sym.Activation(data=bn2, act_type='relu', name=name + '_relu2')
-        conv2 = mx.sym.Convolution(data=act2, num_filter=num_filter, kernel=(3,3), stride=(1,1), pad=(1,1),
-                                      no_bias=True, workspace=workspace, name=name + '_conv2')
-        if dim_match:
-            shortcut = data
-        else:
-            shortcut = mx.sym.Convolution(data=act1, num_filter=num_filter, kernel=(1,1), stride=stride, no_bias=True,
-                                            workspace=workspace, name=name+'_sc')
-        if memonger:
-            shortcut._set_attr(mirror_stage='True')
-        return conv2 + shortcut
-
-def resnet(units, num_stages, filter_list, num_classes, image_shape, bottle_neck=True, bn_mom=0.9, workspace=256, dtype='float32', memonger=False):
-    """Return ResNet symbol of
-    Parameters
-    ----------
-    units : list
-        Number of units in each stage
-    num_stages : int
-        Number of stage
-    filter_list : list
-        Channel size of each stage
-    num_classes : int
-        Ouput size of symbol
-    dataset : str
-        Dataset type, only cifar10 and imagenet supports
-    workspace : int
-        Workspace used in convolution operator
-    dtype : str
-        Precision (float32 or float16)
-    """
-    num_unit = len(units)
-    assert(num_unit == num_stages)
-    data = mx.sym.Variable(name='data')
-    if dtype == 'float32':
-        data = mx.sym.identity(data=data, name='id')
-    else:
-        if dtype == 'float16':
-            data = mx.sym.Cast(data=data, dtype=np.float16)
-    data = mx.sym.BatchNorm(data=data, fix_gamma=True, eps=2e-5, momentum=bn_mom, name='bn_data')
-    (nchannel, height, width) = image_shape
-    if height <= 32:            # such as cifar10
-        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(3, 3), stride=(1,1), pad=(1, 1),
-                                  no_bias=True, name="conv0", workspace=workspace)
-    else:                       # often expected to be 224 such as imagenet
-        body = mx.sym.Convolution(data=data, num_filter=filter_list[0], kernel=(7, 7), stride=(2,2), pad=(3, 3),
-                                  no_bias=True, name="conv0", workspace=workspace)
-        body = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn0')
-        body = mx.sym.Activation(data=body, act_type='relu', name='relu0')
-        body = mx.sym.Pooling(data=body, kernel=(3, 3), stride=(2,2), pad=(1,1), pool_type='max')
-
-    for i in range(num_stages):
-        body = residual_unit(body, filter_list[i+1], (1 if i==0 else 2, 1 if i==0 else 2), False,
-                             name='stage%d_unit%d' % (i + 1, 1), bottle_neck=bottle_neck, workspace=workspace,
-                             memonger=memonger)
-        for j in range(units[i]-1):
-            body = residual_unit(body, filter_list[i+1], (1,1), True, name='stage%d_unit%d' % (i + 1, j + 2),
-                                 bottle_neck=bottle_neck, workspace=workspace, memonger=memonger)
-    bn1 = mx.sym.BatchNorm(data=body, fix_gamma=False, eps=2e-5, momentum=bn_mom, name='bn1')
-    relu1 = mx.sym.Activation(data=bn1, act_type='relu', name='relu1')
-    # Although kernel is not used here when global_pool=True, we should put one
-    pool1 = mx.sym.Pooling(data=relu1, global_pool=True, kernel=(7, 7), pool_type='avg', name='pool1')
-    flat = mx.sym.Flatten(data=pool1)
-    fc1 = mx.sym.FullyConnected(data=flat, num_hidden=num_classes, name='fc1')
-    if dtype == 'float16':
-        fc1 = mx.sym.Cast(data=fc1, dtype=np.float32)
-    return mx.sym.SoftmaxOutput(data=fc1, name='softmax')
-
-def get_symbol(num_classes, num_layers, image_shape, conv_workspace=256, dtype='float32', **kwargs):
-    """
-    Adapted from https://github.com/tornadomeet/ResNet/blob/master/train_resnet.py
-    Original author Wei Wu
-    """
-    image_shape = [int(l) for l in image_shape.split(',')]
-    (nchannel, height, width) = image_shape
-    if height <= 28:
-        num_stages = 3
-        if (num_layers-2) % 9 == 0 and num_layers >= 164:
-            per_unit = [(num_layers-2)//9]
-            filter_list = [16, 64, 128, 256]
-            bottle_neck = True
-        elif (num_layers-2) % 6 == 0 and num_layers < 164:
-            per_unit = [(num_layers-2)//6]
-            filter_list = [16, 16, 32, 64]
-            bottle_neck = False
-        else:
-            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
-        units = per_unit * num_stages
-    else:
-        if num_layers >= 50:
-            filter_list = [64, 256, 512, 1024, 2048]
-            bottle_neck = True
-        else:
-            filter_list = [64, 64, 128, 256, 512]
-            bottle_neck = False
-        num_stages = 4
-        if num_layers == 18:
-            units = [2, 2, 2, 2]
-        elif num_layers == 34:
-            units = [3, 4, 6, 3]
-        elif num_layers == 50:
-            units = [3, 4, 6, 3]
-        elif num_layers == 101:
-            units = [3, 4, 23, 3]
-        elif num_layers == 152:
-            units = [3, 8, 36, 3]
-        elif num_layers == 200:
-            units = [3, 24, 36, 3]
-        elif num_layers == 269:
-            units = [3, 30, 48, 8]
-        else:
-            raise ValueError("no experiments done on num_layers {}, you can do it yourself".format(num_layers))
-
-    return resnet(units       = units,
-                  num_stages  = num_stages,
-                  filter_list = filter_list,
-                  num_classes = num_classes,
-                  image_shape = image_shape,
-                  bottle_neck = bottle_neck,
-                  workspace   = conv_workspace,
-                  dtype       = dtype)
diff --git a/tests/python/unittest/save_000800.json b/tests/python/unittest/save_000800.json
deleted file mode 100644
index 7b385e2983d8..000000000000
--- a/tests/python/unittest/save_000800.json
+++ /dev/null
@@ -1,188 +0,0 @@
-{
-  "nodes": [
-    {
-      "op": "null", 
-      "param": {}, 
-      "name": "data", 
-      "inputs": [], 
-      "backward_source_id": -1, 
-      "attr": {
-        "ctx_group": "stage1", 
-        "lr_mult": "0.2"
-      }
-    }, 
-    {
-      "op": "null", 
-      "param": {}, 
-      "name": "fc1_weight", 
-      "inputs": [], 
-      "backward_source_id": -1, 
-      "attr": {
-        "ctx_group": "stage1", 
-        "wd_mult": "0.3", 
-        "weight_lr_mult": "1.2"
-      }
-    }, 
-    {
-      "op": "null", 
-      "param": {}, 
-      "name": "fc1_bias", 
-      "inputs": [], 
-      "backward_source_id": -1, 
-      "attr": {
-        "ctx_group": "stage1", 
-        "wd_mult": "0.3", 
-        "weight_lr_mult": "1.2"
-      }
-    }, 
-    {
-      "op": "FullyConnected", 
-      "param": {
-        "no_bias": "False", 
-        "num_hidden": "128"
-      }, 
-      "name": "fc1", 
-      "inputs": [[0, 0], [1, 0], [2, 0]], 
-      "backward_source_id": -1, 
-      "attr": {
-        "ctx_group": "stage1", 
-        "wd_mult": "0.3", 
-        "weight_lr_mult": "1.2"
-      }
-    }, 
-    {
-      "op": "Activation", 
-      "param": {"act_type": "relu"}, 
-      "name": "relu1", 
-      "inputs": [[3, 0]], 
-      "backward_source_id": -1, 
-      "attr": {"ctx_group": "stage1"}
-    }, 
-    {
-      "op": "null", 
-      "param": {}, 
-      "name": "fc2_weight", 
-      "inputs": [], 
-      "backward_source_id": -1, 
-      "attr": {
-        "ctx_group": "stage2", 
-        "lr_mult": "0.01"
-      }
-    }, 
-    {
-      "op": "null", 
-      "param": {}, 
-      "name": "fc2_bias", 
-      "inputs": [], 
-      "backward_source_id": -1, 
-      "attr": {
-        "ctx_group": "stage2", 
-        "lr_mult": "0.01"
-      }
-    }, 
-    {
-      "op": "FullyConnected", 
-      "param": {
-        "no_bias": "False", 
-        "num_hidden": "64"
-      }, 
-      "name": "fc2", 
-      "inputs": [[4, 0], [5, 0], [6, 0]], 
-      "backward_source_id": -1, 
-      "attr": {
-        "ctx_group": "stage2", 
-        "lr_mult": "0.01"
-      }
-    }, 
-    {
-      "op": "Activation", 
-      "param": {"act_type": "relu"}, 
-      "name": "relu2", 
-      "inputs": [[7, 0]], 
-      "backward_source_id": -1, 
-      "attr": {"ctx_group": "stage2"}
-    }, 
-    {
-      "op": "null", 
-      "param": {}, 
-      "name": "fc3_weight", 
-      "inputs": [], 
-      "backward_source_id": -1, 
-      "attr": {"ctx_group": "stage2"}
-    }, 
-    {
-      "op": "null", 
-      "param": {}, 
-      "name": "fc3_bias", 
-      "inputs": [], 
-      "backward_source_id": -1, 
-      "attr": {"ctx_group": "stage2"}
-    }, 
-    {
-      "op": "FullyConnected", 
-      "param": {
-        "no_bias": "False", 
-        "num_hidden": "10"
-      }, 
-      "name": "fc3", 
-      "inputs": [[8, 0], [9, 0], [10, 0]], 
-      "backward_source_id": -1, 
-      "attr": {"ctx_group": "stage2"}
-    }, 
-    {
-      "op": "null", 
-      "param": {}, 
-      "name": "batchnorm0_gamma", 
-      "inputs": [], 
-      "backward_source_id": -1, 
-      "attr": {"ctx_group": "stage2"}
-    }, 
-    {
-      "op": "null", 
-      "param": {}, 
-      "name": "batchnorm0_beta", 
-      "inputs": [], 
-      "backward_source_id": -1, 
-      "attr": {"ctx_group": "stage2"}
-    }, 
-    {
-      "op": "BatchNorm", 
-      "param": {
-        "eps": "0.001", 
-        "fix_gamma": "True", 
-        "momentum": "0.9", 
-        "use_global_stats": "False"
-      }, 
-      "name": "batchnorm0", 
-      "inputs": [[11, 0], [12, 0], [13, 0]], 
-      "backward_source_id": -1, 
-      "attr": {"ctx_group": "stage2"}
-    }, 
-    {
-      "op": "null", 
-      "param": {}, 
-      "name": "softmax_label", 
-      "inputs": [], 
-      "backward_source_id": -1, 
-      "attr": {"ctx_group": "stage2"}
-    }, 
-    {
-      "op": "SoftmaxOutput", 
-      "param": {
-        "grad_scale": "1", 
-        "ignore_label": "-1", 
-        "multi_output": "False", 
-        "normalization": "null", 
-        "out_grad": "False", 
-        "preserve_shape": "False", 
-        "use_ignore": "False"
-      }, 
-      "name": "softmax", 
-      "inputs": [[14, 0], [15, 0]], 
-      "backward_source_id": -1, 
-      "attr": {"ctx_group": "stage2"}
-    }
-  ], 
-  "arg_nodes": [0, 1, 2, 5, 6, 9, 10, 12, 13, 15], 
-  "heads": [[16, 0]]
-}
\ No newline at end of file
diff --git a/tests/python/unittest/test_memory_opt.py b/tests/python/unittest/test_memory_opt.py
index ae4bd3bf1e9a..af671fe468e7 100644
--- a/tests/python/unittest/test_memory_opt.py
+++ b/tests/python/unittest/test_memory_opt.py
@@ -167,31 +167,6 @@ def grep_exec_memory_consumption(exec):
                   "debug string: %s" % exec_debug_str
 
 
-@memory_opt_env_check
-def test_resnet152():
-    # Verify the memory allocation behavior on ResNet-152, the state-of-the-art
-    # model used for image classification.
-
-    import resnet
-    resnet_152 = resnet.get_symbol(num_classes=1000,
-                                   num_layers=152,
-                                   image_shape='3,224,224')
-    # We do the binding twice, one with the memory optimizations and one without.
-    # It is expected that the memory consumption of the former should be roughly
-    # half of that of the latter.
-    memory_opt_exec = resnet_152.simple_bind(mx.cpu(), 'write',
-                                             data=(32, 3, 224, 224))
-    os.environ["MXNET_MEMORY_OPT"] = '0'
-    no_opt_exec = resnet_152.simple_bind(mx.cpu(), 'write', data=(32, 3, 224, 224))
-    os.environ["MXNET_MEMORY_OPT"] = '1'
-    memory_opt_alloc = grep_exec_memory_consumption(memory_opt_exec)
-    no_opt_alloc = grep_exec_memory_consumption(no_opt_exec)
-    assert memory_opt_alloc / no_opt_alloc < 0.6, \
-           "The ratio between the memory consumption with the memory optimizations " \
-           "enabled and disabled (%d vs. %d MB) is expected to be smaller than 0.6"  \
-           % (memory_opt_alloc, no_opt_alloc)
-
-
 if __name__ == "__main__":
     import nose
     nose.runmodule()
diff --git a/tests/python/unittest/test_multi_device_exec.py b/tests/python/unittest/test_multi_device_exec.py
deleted file mode 100644
index aa279b183722..000000000000
--- a/tests/python/unittest/test_multi_device_exec.py
+++ /dev/null
@@ -1,90 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import os
-import numpy as np
-import mxnet as mx
-
-def test_ctx_group():
-    def check_ctx_group(group2ctx, grad_req, mlp, set_stage1):
-        texec = mlp.simple_bind(mx.cpu(0),
-                                group2ctx=group2ctx,
-                                data=(1,200), grad_req=grad_req)
-
-        for arr, name in zip(texec.arg_arrays, mlp.list_arguments()):
-            if name in set_stage1:
-                assert arr.context == group2ctx['stage1']
-            else:
-                assert arr.context == group2ctx['stage2']
-
-    with mx.AttrScope(ctx_group='stage1'):
-        data = mx.symbol.Variable('data')
-        fc1  = mx.symbol.FullyConnected(data = data, name='fc1', num_hidden=128)
-        act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
-
-    set_stage1 = set(act1.list_arguments())
-    with mx.AttrScope(ctx_group='stage2'):
-        fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64)
-        act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
-        fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10)
-        fc3 = mx.symbol.BatchNorm(fc3)
-        mlp  = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
-
-    set_stage2 = set(mlp.list_arguments()) - set_stage1
-
-    group2ctx = {
-        'stage1' : mx.cpu(1),
-        'stage2' : mx.cpu(2)
-    }
-
-    # generate reqs with null
-    grad_req_with_null = {}
-    for arg in mlp.list_arguments():
-        grad_req_with_null[arg] = 'null' if arg == 'data' else 'write'
-
-    grad_reqs = ['write', grad_req_with_null]
-    for grad_req in grad_reqs:
-        check_ctx_group(group2ctx, grad_req, mlp, set_stage1)
-
-def test_ctx_group_sparse():
-    with mx.AttrScope(ctx_group='stage1'):
-        lhs = mx.symbol.Variable('lhs', stype='csr')
-        rhs = mx.symbol.Variable('rhs', stype='row_sparse')
-        dot  = mx.symbol.dot(lhs, rhs, name='dot')
-
-    set_stage1 = set(dot.list_arguments())
-    with mx.AttrScope(ctx_group='stage2'):
-        softmax  = mx.symbol.SoftmaxOutput(data = dot, name = 'softmax')
-
-    set_stage2 = set(softmax.list_arguments()) - set_stage1
-
-    group2ctx = {
-        'stage1' : mx.cpu(1),
-        'stage2' : mx.cpu(2)
-    }
-    texec = softmax.simple_bind(mx.cpu(0), group2ctx=group2ctx,
-                                lhs=(32,200), rhs=(200, 5))
-
-    for arr, name in zip(texec.arg_arrays, softmax.list_arguments()):
-        if name in set_stage1:
-            assert arr.context == group2ctx['stage1']
-        else:
-            assert arr.context == group2ctx['stage2']
-
-if __name__ == '__main__':
-    test_ctx_group()
-    test_ctx_group_sparse()
diff --git a/tests/python/unittest/test_operator.py b/tests/python/unittest/test_operator.py
index 59a7e2620096..cc378b0135e9 100644
--- a/tests/python/unittest/test_operator.py
+++ b/tests/python/unittest/test_operator.py
@@ -349,157 +349,6 @@ def check_slice_channel(data_ndim, axis, num_outputs, squeeze_axis):
     check_slice_channel(data_ndim=3, axis=-1, num_outputs=2, squeeze_axis=False)
     check_slice_channel(data_ndim=5, axis=-2, num_outputs=3, squeeze_axis=True)
 
-@with_seed()
-def test_regression():
-    ''' test regression operator '''
-    def check_regression(symbol, forward, backward, shape, stype='default', densities=[0, 0.5, 1]):
-        # init executor
-        data = mx.symbol.Variable('data')
-        label = mx.symbol.Variable('label', stype=stype)
-        out = symbol(data, label)
-        grad_req = {'data': 'write', 'label': 'null'}
-        out_exec = out.simple_bind(default_context(), grad_req=grad_req,
-            data=shape, label=shape)
-        arg_map = dict(zip(out.list_arguments(), out_exec.arg_arrays))
-        grad_map = dict(zip(out.list_arguments(), out_exec.grad_arrays))
-        # init data
-        arr_data = mx.random.uniform(-1, 1, shape)
-        arg_map["data"][:] = arr_data
-        # init label based on density
-        arr_label = arg_map["label"]
-        atol = 1e-5
-        for density in densities:
-            arr_label[:] = rand_ndarray(shape, stype, density=density)
-            out_exec.forward(is_train=True)
-            out_exec.backward()
-            np_out = forward(arr_data.asnumpy())
-            out_grad = backward(np_out, arr_label.asnumpy().reshape(np_out.shape)) / shape[1]
-            assert_almost_equal(out_exec.outputs[0], np_out, atol=atol)
-            assert_almost_equal(grad_map["data"], out_grad, atol=atol)
-
-    shape = (50, 30)
-
-    check_regression(mx.symbol.LogisticRegressionOutput,
-                     lambda x: 1.0 / (1.0 + np.exp(-x)),
-                     lambda x, y : x - y,
-                     shape)
-    check_regression(mx.symbol.LinearRegressionOutput,
-                     lambda x: x,
-                     lambda x, y : x - y,
-                     shape)
-    check_regression(mx.symbol.MAERegressionOutput,
-                     lambda x: x,
-                     lambda x, y : np.where(x > y, np.ones(x.shape), -np.ones(x.shape)),
-                     shape)
-    check_regression(mx.symbol.LogisticRegressionOutput,
-                     lambda x: 1.0 / (1.0 + np.exp(-x)),
-                     lambda x, y : x - y,
-                     shape, stype='csr')
-    check_regression(mx.symbol.LinearRegressionOutput,
-                     lambda x: x,
-                     lambda x, y : x - y,
-                     shape, stype='csr')
-
-
-def check_softmax_grad(xpu):
-    x = mx.sym.Variable('x')
-    label = mx.sym.Variable('label')
-    x_nd = mx.nd.array([[1, 6, 4, 2]], ctx=xpu)
-    grad_x = mx.nd.zeros((1,4), ctx=xpu)
-    label_nd = mx.nd.array([1], ctx=xpu)
-
-    sym = mx.sym.SoftmaxOutput(data=x, label=label, ignore_label=0, use_ignore=False)
-    ex = sym.bind(ctx=xpu, args={'x': x_nd, 'label': label_nd}, args_grad={'x': grad_x})
-
-    ex.forward(is_train=True)
-    softmax_out = ex.outputs[0].asnumpy()
-    expected_softmax_out = [[0.005806628, 0.861780069, 0.116629249, 0.015784052]]
-    assert np.isclose(softmax_out, expected_softmax_out).all()
-
-    ex.backward(is_train=True)
-    grad_out = ex.grad_arrays[0].asnumpy()
-    k = int(label_nd[0].asscalar())
-    expected_grad_out = np.zeros((1,4))
-    expected_grad_out[0, k] = -1
-    assert np.isclose(grad_out - softmax_out, expected_grad_out).all()
-
-
-def check_smoothed_softmax_grad(xpu):
-    alpha = 0.2
-    x = mx.sym.Variable('x')
-    label = mx.sym.Variable('label')
-    x_nd = mx.nd.array([[1, 6, 4, 2]], ctx=xpu)
-    grad_x = mx.nd.zeros((1,4), ctx=xpu)
-    label_nd = mx.nd.array([1], ctx=xpu)
-
-    sym = mx.sym.SoftmaxOutput(data=x, label=label, ignore_label=0, use_ignore=False, smooth_alpha=alpha)
-    ex = sym.bind(ctx=xpu, args={'x': x_nd, 'label': label_nd}, args_grad={'x': grad_x})
-
-    ex.forward(is_train=True)
-    softmax_out = ex.outputs[0].asnumpy()
-    expected_softmax_out = [[0.005806628, 0.861780069, 0.116629249, 0.015784052]]
-    assert np.isclose(softmax_out, expected_softmax_out).all()
-
-    ex.backward(is_train=True)
-    grad_out = ex.grad_arrays[0].asnumpy()
-    k = int(label_nd[0].asscalar())
-    expected_grad_out = np.full((1,4), fill_value=-alpha/float(4-1))
-    expected_grad_out[0, k] = - (1 - alpha)
-    assert np.isclose(grad_out - softmax_out, expected_grad_out).all()
-
-
-def check_softmax_with_ignore_label(xpu):
-    X = mx.symbol.Variable('X')
-    L = mx.symbol.Variable('L')
-    Y = mx.symbol.SoftmaxOutput(data=X, label=L, ignore_label=0, use_ignore=True)
-
-    shape = (20, 10)
-    x = mx.nd.empty(shape, ctx = xpu)
-    l = mx.nd.empty((shape[0],), ctx = xpu)
-    x_np = np.random.rand(*shape)
-    l_np = np.random.randint(0, shape[1]-1, (shape[0],))
-    x[:] = x_np
-    l[:] = l_np
-
-    grad = mx.nd.empty(shape, ctx = xpu)
-
-    exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad})
-    exec1.forward(is_train=True)
-    exec1.backward()
-
-    grad0 = grad.asnumpy()
-
-    for i in range(int(shape[0]/2)):
-        l_np[i] = 0
-    l[:] = l_np
-
-    exec1.forward(is_train=True)
-    exec1.backward()
-    grad1 = grad.asnumpy()
-
-    assert abs(np.sum(grad1[:int(shape[0]/2)])) < 1e-5
-    assert_almost_equal(grad0[int(shape[0]/2):], grad1[int(shape[0]/2):])
-
-
-def check_softmax_with_shape(shape, xpu, preserve_shape=False):
-    # bind with label
-    X = mx.symbol.Variable('X')
-    L = mx.symbol.Variable('L')
-    Y = mx.symbol.SoftmaxOutput(data=X, label=L, preserve_shape=preserve_shape)
-    x = mx.random.uniform(-1, 1, shape, ctx=xpu)
-    l = mx.random.uniform(-1, 1, shape, ctx=xpu)
-    l[:] = np_softmax(l.asnumpy())
-    grad = mx.nd.empty(shape, ctx = xpu)
-    exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad})
-    exec1.forward(is_train=True)
-    out = exec1.outputs[0].asnumpy()
-    # Non-zero atol required by test_softmax with seed 781663739
-    rtol = 1e-4
-    atol = 1e-6
-    assert_almost_equal(out, np_softmax(x.asnumpy()), rtol=rtol, atol=atol)
-    exec1.backward()
-    assert_almost_equal(grad, np_softmax(x.asnumpy()) - l.asnumpy(), rtol=rtol, atol=atol)
-
 
 def test_python_op():
     X = mx.symbol.Variable('X')
@@ -3386,67 +3235,6 @@ def test_infer_type(dtype):
         unittest_correlation((5,1,11,11), kernel_size = 5,max_displacement = 1,stride1 = 1,stride2 = 1,pad_size = 2,is_multiply = False, dtype = dtype)
 
 
-@with_seed()
-def test_support_vector_machine_l1_svm():
-    xpu = default_context()
-    shape = (20, 10)
-
-    X = mx.symbol.Variable('X')
-    L = mx.symbol.Variable('L')
-    Y = mx.symbol.SVMOutput(data=X, label=L, use_linear=True)
-    x = mx.nd.empty(shape, ctx = xpu)
-    l = mx.nd.empty((shape[0],), ctx = xpu)
-    x_np = np.random.rand(*shape)
-    l_np = np.random.randint(0, shape[1], (shape[0],))
-    x[:] = x_np
-    l[:] = l_np
-
-    grad = mx.nd.empty(shape, ctx = xpu)
-    exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad})
-    exec1.forward(is_train=True)
-
-    assert_almost_equal(x_np, exec1.outputs[0])
-
-    exec1.backward()
-
-    l_mask = np.equal(l_np.reshape(shape[0],1),range(shape[1]))
-    l_mask = np.array(l_mask, dtype=np.float32)*2 -1
-    grad_np = (-1) * l_mask * np.greater(1 - l_mask * x_np, 0)
-
-    assert_almost_equal(grad_np, grad)
-
-
-@with_seed()
-def test_support_vector_machine_l2_svm():
-    xpu = default_context()
-    shape = (20, 10)
-
-    X = mx.symbol.Variable('X')
-    L = mx.symbol.Variable('L')
-    Y = mx.symbol.SVMOutput(data=X, label=L)
-    x = mx.nd.empty(shape, ctx = xpu)
-    l = mx.nd.empty((shape[0],), ctx = xpu)
-    x_np = np.random.rand(*shape)
-    x_np = x_np.astype(np.float32)
-    l_np = np.random.randint(0, shape[1], (shape[0],))
-    x[:] = x_np
-    l[:] = l_np
-
-    grad = mx.nd.empty(shape, ctx = xpu)
-    exec1 = Y.bind(xpu, args = [x, l], args_grad = {'X': grad})
-    exec1.forward(is_train=True)
-
-    assert_almost_equal(x_np, exec1.outputs[0])
-
-    exec1.backward()
-
-    l_mask = np.equal(l_np.reshape(shape[0],1),range(shape[1]))
-    l_mask = np.array(l_mask, dtype=np.float32)*2 -1
-    grad_np = (-2)*l_mask*np.maximum(1-l_mask*x_np,0)
-    grad_np = grad_np.astype(np.float32)
-    assert_almost_equal(grad_np, grad)
-
-
 # Seed set because the test is not robust enough to operate on random data
 @with_seed(1234)
 def test_roipooling():
@@ -7341,86 +7129,6 @@ def test_binary_math_operators():
             name, op[0], shape, op[4], op[5], op[6], op[7], rtol_fd, atol_fd,
             num_eps)
 
-
-@with_seed()
-def test_softmax():
-    check_softmax_with_shape((3, 4), default_context(), preserve_shape=False)
-    check_softmax_with_shape((3, 4), default_context(), preserve_shape=True)
-    check_softmax_with_shape((3, 4, 2), default_context(), preserve_shape=True)
-    check_softmax_grad(default_context())
-    check_smoothed_softmax_grad(default_context())
-
-
-@xfail_when_nonstandard_decimal_separator
-@with_seed()
-def test_softmax_output_normalization():
-    def _softmaxoutput_normalization(multi_output, use_ignore, normalization):
-        grad_scale = np.random.random()
-        batch_size = 8
-        num_labels = 6
-        H, W = 3, 3
-        ignore_label = np.random.randint(0, num_labels) if use_ignore else -1
-
-        if multi_output:
-            data_shape = (batch_size, num_labels, H, W)
-            label_shape = (batch_size, H, W)
-        else:
-            data_shape = (batch_size, num_labels)
-            label_shape = (batch_size, )
-
-        data = mx.nd.random.uniform(-1, 1, shape=data_shape)
-        label = mx.nd.random.randint(
-            0, num_labels, shape=label_shape).astype('float32')
-        data.attach_grad()
-
-        kwargs = dict(grad_scale=grad_scale,
-                      normalization=normalization, multi_output=multi_output)
-        if use_ignore:
-            kwargs.update(use_ignore=True, ignore_label=ignore_label)
-
-        with mx.autograd.record():
-            out = mx.nd.SoftmaxOutput(data=data, label=label, **kwargs)
-        out.backward(mx.nd.ones_like(data))
-
-        exp_data = mx.nd.exp(data)
-        softmax_data = exp_data / exp_data.sum(1, keepdims=True)
-        argmax_data = mx.nd.argmax(data, axis=1)
-
-        assert_almost_equal(out.asnumpy(), softmax_data.asnumpy())
-        one_hot_label = mx.nd.one_hot(label, num_labels)
-        if multi_output:
-            one_hot_label = one_hot_label.transpose((0, 3, 1, 2))
-        data_grad = softmax_data - one_hot_label
-
-        if use_ignore:
-            if multi_output:
-                data_grad *= (label !=
-                              ignore_label).reshape((batch_size, 1, H, W))
-            else:
-                data_grad *= (label != ignore_label).reshape((batch_size, 1))
-
-        valid_cnt = 1
-        if normalization == 'batch':
-            valid_cnt = batch_size
-        elif normalization == 'valid':
-            valid_cnt = mx.nd.maximum(1, (label != ignore_label).sum())
-        scale = grad_scale / valid_cnt
-
-        if multi_output:
-            if normalization != 'valid':
-                scale /= H * W
-
-        data_grad *= scale
-
-        assert_almost_equal(data.grad.asnumpy(), data_grad.asnumpy())
-
-    for multi_output in [False, True]:
-        for use_ignore in [False, True]:
-            for normalization in ['null', 'batch', 'valid']:
-                _softmaxoutput_normalization(
-                    multi_output, use_ignore, normalization)
-
-
 @with_seed()
 @pytest.mark.serial
 def test_slice():
diff --git a/tests/python/unittest/test_sparse_operator.py b/tests/python/unittest/test_sparse_operator.py
index 7b649b1f4362..a88ff2e5a05a 100644
--- a/tests/python/unittest/test_sparse_operator.py
+++ b/tests/python/unittest/test_sparse_operator.py
@@ -1728,24 +1728,6 @@ def np_softmax(x, axis=-1):
         x /= np.sum(x, axis=axis, keepdims=True)
         return x
 
-    def check_softmax_with_shape(lhs_stype, rhs_stype, shape, preserve_shape=False):
-        # bind with label
-        ctx = default_context()
-        X = mx.symbol.Variable('X', stype=lhs_stype)
-        L = mx.symbol.Variable('L', stype=rhs_stype)
-        Y = mx.symbol.SoftmaxOutput(data=X, label=L, preserve_shape=preserve_shape)
-        x = rand_ndarray(shape, lhs_stype)
-        l = rand_ndarray(shape, rhs_stype)
-        l[:] = np_softmax(l.asnumpy())
-        grad = mx.nd.empty(shape, ctx=ctx)
-        exec1 = Y.bind(ctx, args = [x, l], args_grad = {'X': grad})
-        exec1.forward(is_train=True)
-        out = exec1.outputs[0].asnumpy()
-        assert_almost_equal(out, np_softmax(x.asnumpy()), rtol=1e-4)
-        exec1.backward()
-        assert_almost_equal(grad.asnumpy(), np_softmax(x.asnumpy()) - l.asnumpy(),
-                            rtol=1e-3, atol=1e-4)
-
     def check_concat(shape, lhs_stype, rhs_stype):
         x = mx.symbol.Variable('x', stype=lhs_stype)
         w = mx.symbol.Variable('w', stype=rhs_stype)
@@ -1769,8 +1751,6 @@ def check_operator_with_temp_resource(shape, stype):
         for rhs in stypes:
             check_broadcast_add(shape, lhs, rhs)
             check_concat(shape, lhs, rhs)
-            check_softmax_with_shape(lhs, rhs, shape, preserve_shape=False)
-            check_softmax_with_shape(rhs, rhs, shape, preserve_shape=True)
 
 
 @with_seed()
diff --git a/tests/python/unittest/test_symbol.py b/tests/python/unittest/test_symbol.py
index b5205787d1ba..5bfc4be6f324 100644
--- a/tests/python/unittest/test_symbol.py
+++ b/tests/python/unittest/test_symbol.py
@@ -97,7 +97,7 @@ def test_symbol_children():
     assert sliced.get_children().list_outputs() == ['data']
 
 def test_symbol_pickle():
-    mlist = [models.mlp2(), models.conv()]
+    mlist = [models.mlp2()]
     data = pkl.dumps(mlist)
     mlist2 = pkl.loads(data)
     for x, y  in zip(mlist, mlist2):
@@ -113,24 +113,6 @@ def test_symbol_saveload():
     assert sym.tojson() == data2.tojson()
     os.remove(fname)
 
-def test_symbol_infer_type():
-    data = mx.symbol.Variable('data')
-    f32data = mx.symbol.Cast(data=data, dtype='float32')
-    fc1  = mx.symbol.FullyConnected(data = f32data, name='fc1', num_hidden=128)
-    mlp  = mx.symbol.SoftmaxOutput(data = fc1, name = 'softmax')
-
-    arg, out, aux = mlp.infer_type(data=np.float16)
-    assert arg == [np.float16, np.float32, np.float32, np.float32]
-    assert out == [np.float32]
-    assert aux == []
-
-    # partial infer type
-    arg, out, aux = mlp.infer_type_partial()
-    assert arg == [None, np.float32, np.float32, np.float32]
-    assert out == [np.float32]
-    assert aux == []
-
-
 def test_symbol_infer_shape():
     num_hidden = 128
     num_dim    = 64
@@ -272,38 +254,6 @@ def check_symbol_consistency(sym1, sym2, ctx, skip_grad=False, equal_nan=False):
                                     grad_req='null' if skip_grad else 'write',
                                     equal_nan=equal_nan)
 
-def test_load_000800():
-    with mx.AttrScope(ctx_group='stage1'):
-        data = mx.symbol.Variable('data', lr_mult=0.2)
-        weight = mx.sym.Variable(name='fc1_weight', lr_mult=1.2)
-        fc1  = mx.symbol.FullyConnected(data = data, weight=weight, name='fc1', num_hidden=128, wd_mult=0.3)
-        act1 = mx.symbol.Activation(data = fc1, name='relu1', act_type="relu")
-
-    set_stage1 = set(act1.list_arguments())
-    with mx.AttrScope(ctx_group='stage2'):
-        fc2  = mx.symbol.FullyConnected(data = act1, name = 'fc2', num_hidden = 64, lr_mult=0.01)
-        act2 = mx.symbol.Activation(data = fc2, name='relu2', act_type="relu")
-        fc3  = mx.symbol.FullyConnected(data = act2, name='fc3', num_hidden=10)
-        fc3 = mx.symbol.BatchNorm(fc3, name='batchnorm0')
-        sym1  = mx.symbol.SoftmaxOutput(data = fc3, name = 'softmax')
-
-    curr_path = os.path.dirname(os.path.abspath(os.path.expanduser(__file__)))
-    sym2 = mx.sym.load(os.path.join(curr_path, 'save_000800.json'))
-
-    attr1 = sym1.attr_dict()
-    attr2 = sym2.attr_dict()
-    for k, v1 in attr1.items():
-        assert k in attr2, k
-        v2 = attr2[k]
-        for kk, vv1 in v1.items():
-            if kk.startswith('__') and kk.endswith('__') and \
-               kk != '__profiler_scope__':
-                assert kk in v2 and v2[kk] == vv1, k + str(v1) + str(v2)
-
-    check_symbol_consistency(sym1, sym2,
-        {'ctx': mx.cpu(0), 'group2ctx': {'stage1' : mx.cpu(1), 'stage2' : mx.cpu(2)}, 'data': (1,200)})
-
-
 def test_blockgrad():
     a = mx.sym.Variable('a')
     b = mx.sym.BlockGrad(2*a)
diff --git a/tests/python/unittest/test_viz.py b/tests/python/unittest/test_viz.py
index 74f810ec6798..8e8ebfa9c153 100644
--- a/tests/python/unittest/test_viz.py
+++ b/tests/python/unittest/test_viz.py
@@ -53,7 +53,6 @@ def test_plot_network():
     net = mx.sym.FullyConnected(data=net, name='fc', num_hidden=128)
     net = mx.sym.Activation(data=net, name='relu1', act_type="relu")
     net = mx.sym.FullyConnected(data=net, name='fc', num_hidden=10)
-    net = mx.sym.SoftmaxOutput(data=net, name='out')
     with warnings.catch_warnings(record=True) as w:
         digraph = mx.viz.plot_network(net, shape={'data': (100, 200)},
                                       dtype={'data': np.float32},
@@ -61,4 +60,3 @@ def test_plot_network():
     assert len(w) == 1
     assert "There are multiple variables with the same name in your graph" in str(w[-1].message)
     assert "fc" in str(w[-1].message)
-
diff --git a/tests/tutorials/test_sanity_tutorials.py b/tests/tutorials/test_sanity_tutorials.py
index fb751b4ddade..ef31aa7dc612 100644
--- a/tests/tutorials/test_sanity_tutorials.py
+++ b/tests/tutorials/test_sanity_tutorials.py
@@ -39,19 +39,12 @@
              'nlp/index.md',
              'onnx/index.md',
              'python/index.md',
-             'r/CallbackFunction.md',
              'r/charRnnModel.md',
              'r/classifyRealImageWithPretrainedModel.md',
-             'r/CustomIterator.md',
-             'r/CustomLossFunction.md',
-             'r/fiveMinutesNeuralNetwork.md',
              'r/index.md',
-             'r/mnistCompetition.md',
              'r/MultidimLstm.md',
              'r/ndarray.md',
              'r/symbol.md',
-             'scala/char_lstm.md',
-             'scala/mnist.md',
              'scala/index.md',
              'scala/mxnet_scala_on_intellij.md',
              'scala/mxnet_java_install_and_run_examples.md',