From 7584efd82ee5e1b547246cf0fc41c817d5d0d16e Mon Sep 17 00:00:00 2001 From: "Joshua Z. Zhang" Date: Fri, 17 Apr 2020 12:01:07 -0700 Subject: [PATCH 01/17] c++ dataloader and built-in image/bbox --- cd/README.md | 4 +- cd/utils/artifact_repository.md | 7 +- ci/docker/Dockerfile.build.test.armv8 | 5 + ci/docker/Dockerfile.build.ubuntu_rat | 4 +- ci/docker/Dockerfile.publish.centos7_gpu_cu90 | 43 ++ .../Dockerfile.publish.test.ubuntu1604_cpu | 18 +- .../Dockerfile.publish.test.ubuntu1604_gpu | 24 +- .../Dockerfile.publish.test.ubuntu1804_cpu | 11 +- ci/docker/install/ubuntu_julia.sh | 3 - ci/docker/install/ubuntu_scala.sh | 5 +- ci/docker/install/ubuntu_tutorials.sh | 9 +- config/distribution/linux_cu90.cmake | 36 + config/distribution/linux_cu91.cmake | 36 + .../clojure-package/examples/rnn/get_data.sh | 0 include/mxnet/c_api.h | 156 ++++ include/mxnet/io.h | 102 ++- make/staticbuild/linux_cu90.mk | 180 +++++ make/staticbuild/linux_cu91.mk | 180 +++++ python/mxnet/base.py | 2 + python/mxnet/gluon/contrib/data/__init__.py | 1 + .../gluon/contrib/data/vision/__init__.py | 22 + .../gluon/contrib/data/vision/dataloader.py | 521 +++++++++++++ .../data/vision/transforms/__init__.py | 21 + .../data/vision/transforms/bbox/__init__.py | 26 + .../data/vision/transforms/bbox/bbox.py | 344 +++++++++ .../data/vision/transforms/bbox/utils.py | 428 +++++++++++ python/mxnet/gluon/data/__init__.py | 2 + python/mxnet/gluon/data/_internal.py | 353 +++++++++ python/mxnet/gluon/data/batchify.py | 415 +++++++++++ python/mxnet/gluon/data/dataloader.py | 201 ++++- python/mxnet/gluon/data/dataset.py | 110 +++ python/mxnet/gluon/data/vision/datasets.py | 115 ++- .../gluon/data/vision/transforms/__init__.py | 197 +++++ .../{transforms.py => transforms/image.py} | 314 ++++---- python/mxnet/gluon/nn/basic_layers.py | 20 +- python/mxnet/image/image.py | 8 +- python/mxnet/io/io.py | 38 +- python/mxnet/ndarray/numpy/_op.py | 93 +++ python/mxnet/numpy/multiarray.py | 174 +++++ python/mxnet/util.py | 26 + .../examples/scripts/module/mnist_mlp.sh | 7 +- src/c_api/c_api.cc | 237 +++++- src/imperative/cached_op.cc | 8 +- src/imperative/cached_op.h | 18 +- src/imperative/imperative_utils.cc | 13 +- src/imperative/imperative_utils.h | 91 ++- src/imperative/naive_cached_op.cc | 108 +++ src/imperative/naive_cached_op.h | 72 ++ src/io/batchify.cc | 397 ++++++++++ src/io/dataloader.cc | 195 +++++ src/io/dataset.cc | 697 ++++++++++++++++++ src/io/image_iter_common.h | 48 +- src/io/io.cc | 3 + src/io/iter_batchloader.h | 141 ++++ src/io/iter_prefetcher.h | 20 +- src/io/iter_sampler.cc | 182 +++++ src/operator/image/crop-inl.h | 306 ++++++++ src/operator/image/crop.cc | 74 +- src/operator/image/crop.cu | 11 + src/operator/image/image_random-inl.h | 18 +- src/operator/image/image_random.cc | 5 + src/operator/image/resize-inl.h | 36 +- .../test_contrib_gluon_data_vision.py | 151 ++++ tests/python/unittest/test_gluon_data.py | 236 +++++- .../python/unittest/test_gluon_data_vision.py | 141 ++++ .../test_numpy_contrib_gluon_data_vision.py | 152 ++++ .../unittest/test_numpy_gluon_data_vision.py | 412 +++++++++++ tests/python/unittest/test_numpy_op.py | 50 ++ tests/python/unittest/test_optimizer.py | 1 - tools/pip/doc/CU90_ADDITIONAL.md | 47 ++ tools/staticbuild/README.md | 2 +- 71 files changed, 7819 insertions(+), 314 deletions(-) create mode 100644 ci/docker/Dockerfile.publish.centos7_gpu_cu90 mode change 100755 => 100644 ci/docker/install/ubuntu_tutorials.sh create mode 100644 config/distribution/linux_cu90.cmake create mode 100644 
config/distribution/linux_cu91.cmake mode change 100755 => 100644 contrib/clojure-package/examples/rnn/get_data.sh create mode 100644 make/staticbuild/linux_cu90.mk create mode 100644 make/staticbuild/linux_cu91.mk create mode 100644 python/mxnet/gluon/contrib/data/vision/__init__.py create mode 100644 python/mxnet/gluon/contrib/data/vision/dataloader.py create mode 100644 python/mxnet/gluon/contrib/data/vision/transforms/__init__.py create mode 100644 python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py create mode 100644 python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py create mode 100644 python/mxnet/gluon/contrib/data/vision/transforms/bbox/utils.py create mode 100644 python/mxnet/gluon/data/_internal.py create mode 100644 python/mxnet/gluon/data/batchify.py create mode 100644 python/mxnet/gluon/data/vision/transforms/__init__.py rename python/mxnet/gluon/data/vision/{transforms.py => transforms/image.py} (68%) create mode 100644 src/imperative/naive_cached_op.cc create mode 100644 src/imperative/naive_cached_op.h create mode 100644 src/io/batchify.cc create mode 100644 src/io/dataloader.cc create mode 100644 src/io/dataset.cc create mode 100644 src/io/iter_sampler.cc create mode 100644 tests/python/unittest/test_contrib_gluon_data_vision.py create mode 100644 tests/python/unittest/test_numpy_contrib_gluon_data_vision.py create mode 100644 tests/python/unittest/test_numpy_gluon_data_vision.py create mode 100644 tools/pip/doc/CU90_ADDITIONAL.md diff --git a/cd/README.md b/cd/README.md index 30cd44bd1d14..8247af964906 100644 --- a/cd/README.md +++ b/cd/README.md @@ -25,7 +25,7 @@ MXNet aims to support a variety of frontends, e.g. Python, Java, Perl, R, etc. a The CD process is driven by the [CD pipeline job](Jenkinsfile_cd_pipeline), which orchestrates the order in which the artifacts are delivered. For instance, first publish the libmxnet library before publishing the pip package. It does this by triggering the [release job](Jenkinsfile_release_job) with a specific set of parameters for each delivery channel. The release job executes the specific release pipeline for a delivery channel across all MXNet *variants*. -A variant is a specific environment or features for which MXNet is compiled. For instance CPU, GPU with CUDA v10.0, CUDA v9.0 with MKL-DNN support, etc. +A variant is a specific environment or feature set for which MXNet is compiled. For instance CPU, GPU with CUDA v10.0, CUDA v9.0 with MKL-DNN support, etc. Currently, below variants are supported. All of these variants except native have MKL-DNN backend enabled. @@ -120,7 +120,7 @@ The "first mile" of the CD process is posting the mxnet binaries to the [artifac ##### Timeout -We shouldn't set global timeouts for the pipelines. Rather, the `step` being executed should be rapped with a `timeout` function (as in the pipeline example above). The `max_time` is a global variable set at the [release job](Jenkinsfile_release_job) level. +We shouldn't set global timeouts for the pipelines. Rather, the `step` being executed should be wrapped with a `timeout` function (as in the pipeline example above). The `max_time` is a global variable set at the [release job](Jenkinsfile_release_job) level. 
##### Node of execution diff --git a/cd/utils/artifact_repository.md b/cd/utils/artifact_repository.md index 5ee736f2d26e..80297efca2c8 100644 --- a/cd/utils/artifact_repository.md +++ b/cd/utils/artifact_repository.md @@ -33,7 +33,7 @@ An mxnet compiled library, or artifact for our purposes, is identified by the fo **Commit Id** -Manually configured through the --git-sha argument. +Manually configured through the --git-sha argument. If not set, derived by: @@ -59,7 +59,7 @@ As long as the tool is being run from the MXNet code base, the runtime feature d If it has been compiled with CUDA support, the output of /usr/local/cuda/bin/nvcc --version can be mined for the exact CUDA version (eg. 8.0, 9.0, etc.). -By knowing which features are enabled on the binary, and if necessary, which CUDA version is installed on the machine, the value for the variant argument can be calculated. Eg. if CUDA features are enabled, and nvcc reports cuda version 10, then the variant would be cu100. If neither MKL-DNN nor CUDA features are enabled, the variant would be native. +By knowing which features are enabled on the binary, and if necessary, which CUDA version is installed on the machine, the value for the variant argument can be calculated. E.g. if CUDA features are enabled, and nvcc reports CUDA version 10, then the variant would be cu100. If neither MKL-DNN nor CUDA features are enabled, the variant would be native. **Dependency Linking** @@ -68,7 +68,7 @@ The library dependencies can be either statically or dynamically linked. This pr ### Uploading an Artifact The user must specify the path to the libmxnet.so, any license files, and any dependencies. The latter two are optional. - + Example: `./artifact_repository.py --push --static --libmxnet /path/to/libmxnet.so --licenses path/to/license1.txt /path/to/other_licenses/*.txt --dependencies /path/to/dependencies/*.so` @@ -102,4 +102,3 @@ dist ``` The libmxnet.meta file will include the characteristics of the artifact (ie. library type, variant, git commit id, etc.) in a “property” file format. - diff --git a/ci/docker/Dockerfile.build.ubuntu_rat b/ci/docker/Dockerfile.build.ubuntu_rat index 234d2e42e946..7536057d73ff 100644 --- a/ci/docker/Dockerfile.build.ubuntu_rat +++ b/ci/docker/Dockerfile.build.ubuntu_rat @@ -20,7 +20,7 @@ FROM ubuntu:16.04 -WORKDIR /work/deps +WORKDIR /usr/local COPY install/ubuntu_rat.sh /work/ RUN /work/ubuntu_rat.sh @@ -31,6 +31,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ - WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.publish.centos7_gpu_cu90 b/ci/docker/Dockerfile.publish.centos7_gpu_cu90 new file mode 100644 index 000000000000..23217148f87c --- /dev/null +++ b/ci/docker/Dockerfile.publish.centos7_gpu_cu90 @@ -0,0 +1,43 @@ +# -*- mode: dockerfile -*- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +FROM nvidia/cuda:9.0-cudnn7-devel-centos7 + +WORKDIR /work/deps + +COPY install/centos7_base.sh /work/ +RUN /work/centos7_base.sh +COPY install/centos7_ccache.sh /work/ +RUN /work/centos7_ccache.sh +COPY install/centos7_python.sh /work/ +RUN /work/centos7_python.sh +COPY install/centos7_scala.sh /work/ +RUN /work/centos7_scala.sh +ENV SHORT_CUDA_VERSION=9.0 +ENV SHORT_NCCL_VERSION=2.4.8 +COPY install/centos7_nccl.sh /work/ +RUN /work/centos7_nccl.sh + +ARG USER_ID=0 +COPY install/centos7_adduser.sh /work/ +RUN /work/centos7_adduser.sh + +ENV PYTHONPATH=./python/ +WORKDIR /work/mxnet + +COPY runtime_functions.sh /work/ diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1604_cpu b/ci/docker/Dockerfile.publish.test.ubuntu1604_cpu index bbb7b6a0d7bd..0f7ff1e3714b 100644 --- a/ci/docker/Dockerfile.publish.test.ubuntu1604_cpu +++ b/ci/docker/Dockerfile.publish.test.ubuntu1604_cpu @@ -20,13 +20,17 @@ FROM ubuntu:16.04 -WORKDIR /work/deps +WORKDIR /usr/local -COPY install/ubuntu_base.sh /work/ -RUN /work/ubuntu_base.sh - -COPY install/ubuntu_scala.sh /work/ -RUN /work/ubuntu_scala.sh +RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ + python3 \ + python3-pip \ + python3-numpy \ + python3-scipy \ + python3-nose \ + python3-nose-timer \ + python3-requests \ + && rm -rf /var/lib/apt/lists/* ARG USER_ID=0 ARG GROUP_ID=0 @@ -34,6 +38,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ - WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1604_gpu b/ci/docker/Dockerfile.publish.test.ubuntu1604_gpu index 660461dc0cfa..8276536bf10a 100644 --- a/ci/docker/Dockerfile.publish.test.ubuntu1604_gpu +++ b/ci/docker/Dockerfile.publish.test.ubuntu1604_gpu @@ -22,18 +22,20 @@ FROM nvidia/cuda:9.2-cudnn7-devel-ubuntu16.04 WORKDIR /work/deps -COPY install/ubuntu_base.sh /work/ -RUN /work/ubuntu_base.sh - -COPY install/ubuntu_scala.sh /work/ -RUN /work/ubuntu_scala.sh +COPY install/centos7_base.sh /work/ +RUN /work/centos7_base.sh +COPY install/centos7_ccache.sh /work/ +RUN /work/centos7_ccache.sh +COPY install/centos7_python.sh /work/ +RUN /work/centos7_python.sh +COPY install/centos7_scala.sh /work/ +RUN /work/centos7_scala.sh ARG USER_ID=0 -ARG GROUP_ID=0 -COPY install/ubuntu_adduser.sh /work/ -RUN /work/ubuntu_adduser.sh - -COPY runtime_functions.sh /work/ +COPY install/centos7_adduser.sh /work/ +RUN /work/centos7_adduser.sh +ENV PYTHONPATH=./python/ WORKDIR /work/mxnet -ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib + +COPY runtime_functions.sh /work/ diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1804_cpu b/ci/docker/Dockerfile.publish.test.ubuntu1804_cpu index e3a8c193f234..fe3a955b9a73 100644 --- a/ci/docker/Dockerfile.publish.test.ubuntu1804_cpu +++ 
b/ci/docker/Dockerfile.publish.test.ubuntu1804_cpu @@ -31,11 +31,10 @@ COPY install/ubuntu_scala.sh /work/ RUN /work/ubuntu_scala.sh ARG USER_ID=0 -ARG GROUP_ID=0 -COPY install/ubuntu_adduser.sh /work/ -RUN /work/ubuntu_adduser.sh - -COPY runtime_functions.sh /work/ +COPY install/centos7_adduser.sh /work/ +RUN /work/centos7_adduser.sh +ENV PYTHONPATH=./python/ WORKDIR /work/mxnet + +COPY runtime_functions.sh /work/ diff --git a/ci/docker/install/ubuntu_julia.sh b/ci/docker/install/ubuntu_julia.sh index 435ec46db6c7..348a6d13da07 100755 --- a/ci/docker/install/ubuntu_julia.sh +++ b/ci/docker/install/ubuntu_julia.sh @@ -17,9 +17,6 @@ # specific language governing permissions and limitations # under the License. -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - set -ex function install_julia() { diff --git a/ci/docker/install/ubuntu_scala.sh b/ci/docker/install/ubuntu_scala.sh index 355e978e075c..e3afd8e728ee 100755 --- a/ci/docker/install/ubuntu_scala.sh +++ b/ci/docker/install/ubuntu_scala.sh @@ -17,9 +17,6 @@ # specific language governing permissions and limitations # under the License. -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - set -ex diff --git a/ci/docker/install/ubuntu_tutorials.sh b/ci/docker/install/ubuntu_tutorials.sh old mode 100755 new mode 100644 index 469df6190ea4..0b2204f1f9a5 --- a/ci/docker/install/ubuntu_tutorials.sh +++ b/ci/docker/install/ubuntu_tutorials.sh @@ -1,5 +1,3 @@ -#!/usr/bin/env bash - # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -17,9 +15,6 @@ # specific language governing permissions and limitations # under the License. -# build and install are separated so changes to build don't invalidate -# the whole docker cache for the image - set -ex apt-get update || true diff --git a/config/distribution/linux_cu90.cmake b/config/distribution/linux_cu90.cmake new file mode 100644 index 000000000000..e4249cd609c8 --- /dev/null +++ b/config/distribution/linux_cu90.cmake @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
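+
+# A typical way to consume this file is to preload it into a fresh build
+# directory as an initial CMake cache (a minimal sketch; the build directory
+# name and parallelism flag are illustrative, not part of this config):
+#
+#   mkdir build && cd build
+#   cmake -C ../config/distribution/linux_cu90.cmake ..
+#   cmake --build . --parallel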
+ +set(CMAKE_BUILD_TYPE "Distribution" CACHE STRING "Build type") +set(CFLAGS "-mno-avx" CACHE STRING "CFLAGS") +set(CXXFLAGS "-mno-avx" CACHE STRING "CXXFLAGS") + +set(USE_CUDA ON CACHE BOOL "Build with CUDA support") +set(USE_CUDNN ON CACHE BOOL "Build with CUDNN support") +set(USE_NCCL ON CACHE BOOL "Build with NCCL support") +set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support") +set(USE_OPENMP ON CACHE BOOL "Build with Openmp support") +set(USE_MKL_IF_AVAILABLE OFF CACHE BOOL "Use Intel MKL if found") +set(USE_MKLDNN ON CACHE BOOL "Build with MKL-DNN support") +set(USE_LAPACK ON CACHE BOOL "Build with lapack support") +set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") +set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") +set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") +set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") + +set(CUDACXX "/usr/local/cuda-9.0/bin/nvcc" CACHE STRING "Cuda compiler") +set(MXNET_CUDA_ARCH "3.0;5.0;6.0;7.0;7.2" CACHE STRING "Cuda architectures") diff --git a/config/distribution/linux_cu91.cmake b/config/distribution/linux_cu91.cmake new file mode 100644 index 000000000000..a239ada43454 --- /dev/null +++ b/config/distribution/linux_cu91.cmake @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set(CMAKE_BUILD_TYPE "Distribution" CACHE STRING "Build type") +set(CFLAGS "-mno-avx" CACHE STRING "CFLAGS") +set(CXXFLAGS "-mno-avx" CACHE STRING "CXXFLAGS") + +set(USE_CUDA ON CACHE BOOL "Build with CUDA support") +set(USE_CUDNN ON CACHE BOOL "Build with CUDNN support") +set(USE_NCCL ON CACHE BOOL "Build with NCCL support") +set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support") +set(USE_OPENMP ON CACHE BOOL "Build with Openmp support") +set(USE_MKL_IF_AVAILABLE OFF CACHE BOOL "Use Intel MKL if found") +set(USE_MKLDNN ON CACHE BOOL "Build with MKL-DNN support") +set(USE_LAPACK ON CACHE BOOL "Build with lapack support") +set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") +set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") +set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") +set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") + +set(CUDACXX "/usr/local/cuda-9.1/bin/nvcc" CACHE STRING "Cuda compiler") +set(MXNET_CUDA_ARCH "3.0;5.0;6.0;7.0;7.2" CACHE STRING "Cuda architectures") diff --git a/contrib/clojure-package/examples/rnn/get_data.sh b/contrib/clojure-package/examples/rnn/get_data.sh old mode 100755 new mode 100644 diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h index d3f0cc96625c..e5dd1f21820d 100644 --- a/include/mxnet/c_api.h +++ b/include/mxnet/c_api.h @@ -81,6 +81,14 @@ typedef void *ExecutorHandle; typedef void *DataIterCreator; /*! 
\brief handle to a DataIterator */ typedef void *DataIterHandle; +/*! \brief handle to a dataset creator */ +typedef void *DatasetCreator; +/*! \brief handle to a Dataset */ +typedef void *DatasetHandle; +/*! \brief handle to a BatchifyFunction creator */ +typedef void *BatchifyFunctionCreator; +/*! \brief handle to a BatchifyFunction */ +typedef void *BatchifyFunctionHandle; /*! \brief handle to KVStore */ typedef void *KVStoreHandle; /*! \brief handle to RecordIO */ typedef void *RecordIOHandle; @@ -2670,6 +2678,13 @@ MXNET_DLL int MXDataIterNext(DataIterHandle handle, */ MXNET_DLL int MXDataIterBeforeFirst(DataIterHandle handle); +/*! + * \brief Call iterator.GetLenHint. Note that some iterators don't provide length. + * \param handle the handle to iterator + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDataIterGetLenHint(DataIterHandle handle, + int64_t *len); /*! * \brief Get the handle to the NDArray of underlying data + * \param handle the handle pointer to the data iterator @@ -2705,6 +2720,147 @@ MXNET_DLL int MXDataIterGetPadNum(DataIterHandle handle, */ MXNET_DLL int MXDataIterGetLabel(DataIterHandle handle, NDArrayHandle *out); +/*! + * \brief Get the handles to the underlying NDArrays of the current items + * \param handle the handle pointer to the data iterator + * \param num_outputs the length of outputs + * \param outputs the handle to an array of NDArrays that stores pointers to handles + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDataIterGetItems(DataIterHandle handle, + int* num_outputs, + NDArrayHandle **outputs); + +/*! + * \brief List all the available dataset entries + * \param out_size the size of returned datasets + * \param out_array the output dataset entries + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXListDatasets(uint32_t *out_size, + DatasetCreator **out_array); +/*! + * \brief Init a dataset, init with parameters + * the array size of passed in arguments + * \param handle of the dataset creator + * \param num_param number of parameters + * \param keys parameter keys + * \param vals parameter values + * \param out resulting dataset + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDatasetCreateDataset(DatasetCreator handle, + uint32_t num_param, + const char **keys, + const char **vals, + DatasetHandle *out); +/*! + * \brief Get the detailed information about dataset. + * \param creator the DatasetCreator. + * \param name The returned name of the creator. + * \param description The returned description of the symbol. + * \param num_args Number of arguments. + * \param arg_names Name of the arguments. + * \param arg_type_infos Type information about the arguments. + * \param arg_descriptions Description information about the arguments. + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDatasetGetDatasetInfo(DatasetCreator creator, + const char **name, + const char **description, + uint32_t *num_args, + const char ***arg_names, + const char ***arg_type_infos, + const char ***arg_descriptions); +/*! + * \brief Free the handle to the IO module + * \param handle the handle pointer to the dataset + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDatasetFree(DatasetHandle handle); +/*! + * \brief Get dataset overall length (size) + * \param handle the handle to dataset + * \param out return value of GetLen + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDatasetGetLen(DatasetHandle handle, + uint64_t *out); /*! 
+ * \brief Get output NDArrays given the specified index + * \param handle the handle to dataset + * \param index the index of the dataset item to be retrieved + * \param num_outputs the number of output ndarrays + * \param outputs the pointers to handles of ndarrays + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXDatasetGetItems(DatasetHandle handle, + uint64_t index, + int* num_outputs, + NDArrayHandle **outputs); + +/*! + * \brief List all the available batchify function entries + * \param out_size the size of returned batchify functions + * \param out_array the output batchify function entries + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXListBatchifyFunctions(uint32_t *out_size, + BatchifyFunctionCreator **out_array); +/*! + * \brief Init a batchify function, init with parameters + * the array size of passed in arguments + * \param handle of the batchify function creator + * \param num_param number of parameters + * \param keys parameter keys + * \param vals parameter values + * \param out resulting batchify function + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXBatchifyFunctionCreateFunction(BatchifyFunctionCreator handle, + uint32_t num_param, + const char **keys, + const char **vals, + BatchifyFunctionHandle *out); +/*! + * \brief Get the detailed information about batchify function. + * \param creator the BatchifyFunctionCreator. + * \param name The returned name of the creator. + * \param description The returned description of the symbol. + * \param num_args Number of arguments. + * \param arg_names Name of the arguments. + * \param arg_type_infos Type information about the arguments. + * \param arg_descriptions Description information about the arguments. + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXBatchifyFunctionGetFunctionInfo(BatchifyFunctionCreator creator, + const char **name, + const char **description, + uint32_t *num_args, + const char ***arg_names, + const char ***arg_type_infos, + const char ***arg_descriptions); +/*! + * \brief Invoke the Batchify Function + * \param handle the handle pointer to the batchify function + * \param batch_size the batch size + * \param num_output the number of ndarrays for output + * \param inputs the pointers to input ndarrays + * \param outputs the pointers to output ndarrays + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXBatchifyFunctionInvoke(BatchifyFunctionHandle handle, + int batch_size, + int num_output, + NDArrayHandle *inputs, + NDArrayHandle **outputs); +/*! + * \brief Free the handle to the IO module + * \param handle the handle pointer to the batchify function + * \return 0 when success, -1 when failure happens + */ +MXNET_DLL int MXBatchifyFunctionFree(BatchifyFunctionHandle handle); //-------------------------------------------- // Part 6: basic KVStore interface //-------------------------------------------- diff --git a/include/mxnet/io.h b/include/mxnet/io.h index e18f03ed0ef3..177be27a7c12 100644 --- a/include/mxnet/io.h +++ b/include/mxnet/io.h @@ -61,6 +61,13 @@ class IIterator : public dmlc::DataIter<DType> { inline void SetDataName(const std::string data_name) { data_names.push_back(data_name); } + /*! \brief request iterator length hint for current epoch. + * Note that the returned value can be < 0, indicating + * that the length of the iterator is unknown unless you go through all the data. 
+ */ + virtual int64_t GetLenHint(void) const { + return -1; + } }; // class IIterator /*! \brief a single data instance */ @@ -104,7 +111,7 @@ struct DataIteratorReg * * \code * // example of registering a mnist iterator - * REGISTER_IO_ITE(MNISTIter) + * REGISTER_IO_ITER(MNISTIter) * .describe("Mnist data iterator") * .set_body([]() { * return new PrefetcherIter(new MNISTIter()); @@ -113,5 +120,98 @@ struct DataIteratorReg */ #define MXNET_REGISTER_IO_ITER(name) \ DMLC_REGISTRY_REGISTER(::mxnet::DataIteratorReg, DataIteratorReg, name) + +/*! + * \brief A randomly accessible dataset which provides GetLen() and GetItem(). + * Unlike DataIter, it's a static lookup storage which is friendly to random access. + * The dataset itself should NOT contain data processing, which should be applied during + * data augmentation or transformation processes. + */ +class Dataset { + public: + /*! + * \brief Get the size of the dataset + */ + virtual uint64_t GetLen(void) const = 0; + /*! + * \brief Create a copy of dataset for threaded worker + */ + virtual Dataset* Clone(void) const = 0; + /*! + * \brief Get the ndarray items given index in dataset + * \param idx the integer index for required data + * \param ret the returned ndarray items + */ + virtual bool GetItem(uint64_t idx, std::vector<NDArray>* ret) = 0; + // virtual destructor + virtual ~Dataset(void) {} +}; // class Dataset + +/*! \brief typedef the factory function of dataset */ +typedef std::function<Dataset *(const std::vector<std::pair<std::string, std::string> >&)> DatasetFactory; +/*! + * \brief Registry entry for Dataset factory functions. + */ +struct DatasetReg + : public dmlc::FunctionRegEntryBase<DatasetReg, DatasetFactory> { +}; +//-------------------------------------------------------------- +// The following part is the API registration of Datasets +//-------------------------------------------------------------- +/*! + * \brief Macro to register Datasets + * + * \code + * // example of registering an image sequence dataset + * MXNET_REGISTER_IO_DATASET(ImageSequenceDataset) + * .describe("image sequence dataset") + * .set_body([]() { + * return new ImageSequenceDataset(); + * }); + * \endcode + */ +#define MXNET_REGISTER_IO_DATASET(name) \ + DMLC_REGISTRY_REGISTER(::mxnet::DatasetReg, DatasetReg, name) + +class BatchifyFunction { + public: + /*! \brief Destructor */ + virtual ~BatchifyFunction(void) {} + /*! \brief The batchify logic */ + virtual bool Batchify(const std::vector<std::vector<NDArray> >& inputs, + std::vector<NDArray>* outputs) = 0; +}; // class BatchifyFunction + +using BatchifyFunctionPtr = std::shared_ptr<BatchifyFunction>; + +/*! \brief typedef the factory function of batchify function */ +typedef std::function<BatchifyFunction *(const std::vector<std::pair<std::string, std::string> >&)> BatchifyFunctionFactory; +/*! + * \brief Registry entry for BatchifyFunction factory functions. + */ +struct BatchifyFunctionReg + : public dmlc::FunctionRegEntryBase<BatchifyFunctionReg, BatchifyFunctionFactory> { +}; +//-------------------------------------------------------------- +// The following part is the API registration of Batchify Functions +//-------------------------------------------------------------- /*! 
+ * \brief Macro to register Batchify Functions + * + * \code + * // example of registering a Batchify Function + * MXNET_REGISTER_IO_BATCHIFY_FUNCTION(StackBatchify) + * .describe("Stack Batchify Function") + * .set_body([]() { + * return new StackBatchify(); + * }); + * \endcode + */ +#define MXNET_REGISTER_IO_BATCHIFY_FUNCTION(name) \ + DMLC_REGISTRY_REGISTER(::mxnet::BatchifyFunctionReg, BatchifyFunctionReg, name) } // namespace mxnet #endif // MXNET_IO_H_ diff --git a/make/staticbuild/linux_cu90.mk b/make/staticbuild/linux_cu90.mk new file mode 100644 index 000000000000..1d0669ef82b6 --- /dev/null +++ b/make/staticbuild/linux_cu90.mk @@ -0,0 +1,180 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------------------------- +# Template configuration for compiling mxnet for making python wheel +#------------------------------------------------------------------------------- + +#--------------------- +# choice of compiler +#-------------------- + +export CC = gcc +export CXX = g++ +export NVCC = nvcc + +# whether compile with options for MXNet developer +DEV = 0 + +# whether compile with debug +DEBUG = 0 + +# whether to turn on signal handler (e.g. 
segfault logger) +USE_SIGNAL_HANDLER = 1 + +# the additional link flags you want to add +ifdef USE_SYSTEM_CUDA +ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +else +ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +endif + +# the additional compile flags you want to add +ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections + +#--------------------------------------------- +# matrix computation libraries for CPU/GPU +#--------------------------------------------- + +# choose the version of blas you want to use +# can be: mkl, blas, atlas, openblas +# in default use atlas for linux while apple for osx +USE_BLAS=openblas + +# whether use opencv during compilation +# you can disable it, however, you will not able to use +# imbin iterator +USE_OPENCV = 1 +# Add OpenCV include path, in which the directory `opencv2` exists +USE_OPENCV_INC_PATH = NONE +# Add OpenCV shared library path, in which the shared library exists +USE_OPENCV_LIB_PATH = NONE + +# whether use CUDA during compile +USE_CUDA = 1 + +# add the path to CUDA library to link and compile flag +# if you have already add them to environment variable, leave it as NONE +# USE_CUDA_PATH = /usr/local/cuda +ifdef USE_SYSTEM_CUDA +USE_CUDA_PATH = /usr/local/cuda-9.0 +else +USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.0 +endif + +# whether to use CuDNN library +USE_CUDNN = 1 + +# whether to use NCCL library +USE_NCCL = 1 + +# CUDA architecture setting: going with all of them. +# For CUDA < 6.0, comment the *_50 lines for compatibility. +# CUDA_ARCH := + +# whether use cuda runtime compiling for writing kernels in native language (i.e. Python) +ENABLE_CUDA_RTC = 1 + +USE_NVTX=1 + +# use openmp for parallelization +USE_OPENMP = 1 +USE_OPERATOR_TUNING = 1 +USE_LIBJPEG_TURBO = 1 + +# whether use MKL-DNN library +USE_MKLDNN = 1 + +# whether use NNPACK library +USE_NNPACK = 0 + +# whether use lapack during compilation +# only effective when compiled with blas versions openblas/apple/atlas/mkl +USE_LAPACK = 1 + +# path to lapack library in case of a non-standard installation +USE_LAPACK_PATH = $(DEPS_PATH)/lib + +# add path to intel library, you may need it for MKL, if you did not add the path +# to environment variable +USE_INTEL_PATH = NONE + +# If use MKL, choose static link automatically to allow python wrapper +ifeq ($(USE_BLAS), mkl) +USE_STATIC_MKL = 1 +else +USE_STATIC_MKL = NONE +endif + +#---------------------------- +# Settings for power and arm arch +#---------------------------- +ARCH := $(shell uname -a) +ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64)) + USE_SSE=0 +else + USE_SSE=1 +endif + +#---------------------------- +# distributed computing +#---------------------------- + +# whether or not to enable multi-machine supporting +USE_DIST_KVSTORE = 1 + +# whether or not allow to read and write HDFS directly. If yes, then hadoop is +# required +USE_HDFS = 0 + +# path to libjvm.so. required if USE_HDFS=1 +LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server + +# whether or not allow to read and write AWS S3 directly. 
If yes, then +# libcurl4-openssl-dev is required, it can be installed on Ubuntu by +# sudo apt-get install -y libcurl4-openssl-dev +USE_S3 = 1 + +#---------------------------- +# additional operators +#---------------------------- + +# path to folders containing projects specific operators that you don't want to put in src/operators +EXTRA_OPERATORS = + + +#---------------------------- +# plugins +#---------------------------- + +# whether to use caffe integration. This requires installing caffe. +# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH +# CAFFE_PATH = $(HOME)/caffe +# MXNET_PLUGINS += plugin/caffe/caffe.mk + +# whether to use torch integration. This requires installing torch. +# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH +# TORCH_PATH = $(HOME)/torch +# MXNET_PLUGINS += plugin/torch/torch.mk + +# WARPCTC_PATH = $(HOME)/warp-ctc +# MXNET_PLUGINS += plugin/warpctc/warpctc.mk + +# whether to use sframe integration. This requires build sframe +# git@github.com:dato-code/SFrame.git +# SFRAME_PATH = $(HOME)/SFrame +# MXNET_PLUGINS += plugin/sframe/plugin.mk diff --git a/make/staticbuild/linux_cu91.mk b/make/staticbuild/linux_cu91.mk new file mode 100644 index 000000000000..89b35b10f6fa --- /dev/null +++ b/make/staticbuild/linux_cu91.mk @@ -0,0 +1,180 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# +#------------------------------------------------------------------------------- +# Template configuration for compiling mxnet for making python wheel +#------------------------------------------------------------------------------- + +#--------------------- +# choice of compiler +#-------------------- + +export CC = gcc +export CXX = g++ +export NVCC = nvcc + +# whether compile with options for MXNet developer +DEV = 0 + +# whether compile with debug +DEBUG = 0 + +# whether to turn on signal handler (e.g. 
segfault logger) +USE_SIGNAL_HANDLER = 1 + +# the additional link flags you want to add +ifdef USE_SYSTEM_CUDA +ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +else +ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections +endif + +# the additional compile flags you want to add +ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections + +#--------------------------------------------- +# matrix computation libraries for CPU/GPU +#--------------------------------------------- + +# choose the version of blas you want to use +# can be: mkl, blas, atlas, openblas +# in default use atlas for linux while apple for osx +USE_BLAS=openblas + +# whether use opencv during compilation +# you can disable it, however, you will not able to use +# imbin iterator +USE_OPENCV = 1 +# Add OpenCV include path, in which the directory `opencv2` exists +USE_OPENCV_INC_PATH = NONE +# Add OpenCV shared library path, in which the shared library exists +USE_OPENCV_LIB_PATH = NONE + +# whether use CUDA during compile +USE_CUDA = 1 + +# add the path to CUDA library to link and compile flag +# if you have already add them to environment variable, leave it as NONE +# USE_CUDA_PATH = /usr/local/cuda +ifdef USE_SYSTEM_CUDA +USE_CUDA_PATH = /usr/local/cuda-9.1 +else +USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.1 +endif + +# whether to use CuDNN library +USE_CUDNN = 1 + +# whether to use NCCL library +USE_NCCL = 1 + +# CUDA architecture setting: going with all of them. +# For CUDA < 6.0, comment the *_50 lines for compatibility. +# CUDA_ARCH := + +# whether use cuda runtime compiling for writing kernels in native language (i.e. Python) +ENABLE_CUDA_RTC = 1 + +USE_NVTX=1 + +# use openmp for parallelization +USE_OPENMP = 1 +USE_OPERATOR_TUNING = 1 +USE_LIBJPEG_TURBO = 1 + +# whether use MKL-DNN library +USE_MKLDNN = 1 + +# whether use NNPACK library +USE_NNPACK = 0 + +# whether use lapack during compilation +# only effective when compiled with blas versions openblas/apple/atlas/mkl +USE_LAPACK = 1 + +# path to lapack library in case of a non-standard installation +USE_LAPACK_PATH = $(DEPS_PATH)/lib + +# add path to intel library, you may need it for MKL, if you did not add the path +# to environment variable +USE_INTEL_PATH = NONE + +# If use MKL, choose static link automatically to allow python wrapper +ifeq ($(USE_BLAS), mkl) +USE_STATIC_MKL = 1 +else +USE_STATIC_MKL = NONE +endif + +#---------------------------- +# Settings for power and arm arch +#---------------------------- +ARCH := $(shell uname -a) +ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64)) + USE_SSE=0 +else + USE_SSE=1 +endif + +#---------------------------- +# distributed computing +#---------------------------- + +# whether or not to enable multi-machine supporting +USE_DIST_KVSTORE = 1 + +# whether or not allow to read and write HDFS directly. If yes, then hadoop is +# required +USE_HDFS = 0 + +# path to libjvm.so. required if USE_HDFS=1 +LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server + +# whether or not allow to read and write AWS S3 directly. 
If yes, then +# libcurl4-openssl-dev is required, it can be installed on Ubuntu by +# sudo apt-get install -y libcurl4-openssl-dev +USE_S3 = 1 + +#---------------------------- +# additional operators +#---------------------------- + +# path to folders containing projects specific operators that you don't want to put in src/operators +EXTRA_OPERATORS = + + +#---------------------------- +# plugins +#---------------------------- + +# whether to use caffe integration. This requires installing caffe. +# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH +# CAFFE_PATH = $(HOME)/caffe +# MXNET_PLUGINS += plugin/caffe/caffe.mk + +# whether to use torch integration. This requires installing torch. +# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH +# TORCH_PATH = $(HOME)/torch +# MXNET_PLUGINS += plugin/torch/torch.mk + +# WARPCTC_PATH = $(HOME)/warp-ctc +# MXNET_PLUGINS += plugin/warpctc/warpctc.mk + +# whether to use sframe integration. This requires build sframe +# git@github.com:dato-code/SFrame.git +# SFRAME_PATH = $(HOME)/SFrame +# MXNET_PLUGINS += plugin/sframe/plugin.mk diff --git a/python/mxnet/base.py b/python/mxnet/base.py index 7a721f70cc3b..8e9700fa6c74 100644 --- a/python/mxnet/base.py +++ b/python/mxnet/base.py @@ -365,6 +365,8 @@ def _load_lib(): ExecutorHandle = ctypes.c_void_p DataIterCreatorHandle = ctypes.c_void_p DataIterHandle = ctypes.c_void_p +DatasetHandle = ctypes.c_void_p +BatchifyFunctionHandle = ctypes.c_void_p KVStoreHandle = ctypes.c_void_p RecordIOHandle = ctypes.c_void_p RtcHandle = ctypes.c_void_p diff --git a/python/mxnet/gluon/contrib/data/__init__.py b/python/mxnet/gluon/contrib/data/__init__.py index 7cb25eb7498e..964d44cf10fe 100644 --- a/python/mxnet/gluon/contrib/data/__init__.py +++ b/python/mxnet/gluon/contrib/data/__init__.py @@ -20,5 +20,6 @@ """Contrib datasets.""" from . import text +from . import vision from .sampler import * diff --git a/python/mxnet/gluon/contrib/data/vision/__init__.py b/python/mxnet/gluon/contrib/data/vision/__init__.py new file mode 100644 index 000000000000..ad71a5f6287a --- /dev/null +++ b/python/mxnet/gluon/contrib/data/vision/__init__.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=wildcard-import +"""Contrib vision utilities.""" +from .transforms import * +from .dataloader import * diff --git a/python/mxnet/gluon/contrib/data/vision/dataloader.py b/python/mxnet/gluon/contrib/data/vision/dataloader.py new file mode 100644 index 000000000000..0c71d90453d8 --- /dev/null +++ b/python/mxnet/gluon/contrib/data/vision/dataloader.py @@ -0,0 +1,521 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable= arguments-differ, wildcard-import +"Contrib Vision DataLoaders." +import logging +import numpy as np + +from ..... import nd +from .....util import is_np_array +from ..... import np as _mx_np # pylint: disable=reimported +from ....nn import HybridSequential, Sequential, HybridBlock, Block +from ....data.vision import transforms +from ....data import DataLoader +from .transforms import bbox + +__all__ = ['create_image_augment', 'ImageDataLoader', 'ImageBboxDataLoader'] + +def create_image_augment(data_shape, resize=0, rand_crop=False, rand_resize=False, rand_mirror=False, + mean=None, std=None, brightness=0, contrast=0, saturation=0, hue=0, + pca_noise=0, rand_gray=0, inter_method=2, dtype='float32'): + """Creates an augmenter block. + + Parameters + ---------- + data_shape : tuple of int + Shape for output data + resize : int + Resize shorter edge if larger than 0 at the beginning + rand_crop : bool + Whether to enable random cropping other than center crop + rand_resize : bool + Whether to enable random sized cropping, requires rand_crop to be enabled + rand_gray : float + [0, 1], probability to convert to grayscale for all channels, the number + of channels will not be reduced to 1 + rand_mirror : bool + Whether to apply horizontal flip to image with probability 0.5 + mean : np.ndarray or None + Mean pixel values for [r, g, b] + std : np.ndarray or None + Standard deviations for [r, g, b] + brightness : float + Brightness jittering range (percent) + contrast : float + Contrast jittering range (percent) + saturation : float + Saturation jittering range (percent) + hue : float + Hue jittering range (percent) + pca_noise : float + PCA noise level (percent) + inter_method : int, default=2 (Bicubic) + Interpolation method for all resizing operations + + Possible values: + 0: Nearest Neighbors Interpolation. + 1: Bilinear interpolation. + 2: Bicubic interpolation over 4x4 pixel neighborhood. + 3: Area-based (resampling using pixel area relation). It may be a + preferred method for image decimation, as it gives moire-free + results. But when the image is zoomed, it is similar to the Nearest + Neighbors method. + 4: Lanczos interpolation over 8x8 pixel neighborhood. + 10: Randomly select from the interpolation methods mentioned above. + Note: + When shrinking an image, it will generally look best with AREA-based + interpolation, whereas, when enlarging an image, it will generally look best + with Bicubic (slow) or Bilinear (faster but still looks OK). + + Examples + -------- + >>> # An example of creating multiple augmenters + >>> augs = mx.gluon.contrib.data.vision.create_image_augment(data_shape=(3, 300, 300), rand_mirror=True, + ... mean=True, brightness=0.125, contrast=0.125, rand_gray=0.05, + ... 
saturation=0.125, pca_noise=0.05, inter_method=10) + """ + if inter_method == 10: + inter_method = np.random.randint(0, 5) + augmenter = HybridSequential('default_img_augment_') + if resize > 0: + augmenter.add(transforms.image.Resize(resize, interpolation=inter_method)) + crop_size = (data_shape[2], data_shape[1]) + if rand_resize: + assert rand_crop + augmenter.add(transforms.image.RandomResizedCrop(crop_size, interpolation=inter_method)) + elif rand_crop: + augmenter.add(transforms.image.RandomCrop(crop_size, interpolation=inter_method)) + else: + augmenter.add(transforms.image.CenterCrop(crop_size, interpolation=inter_method)) + + if rand_mirror: + augmenter.add(transforms.image.RandomFlipLeftRight(0.5)) + + augmenter.add(transforms.Cast()) + + if brightness or contrast or saturation or hue: + augmenter.add(transforms.image.RandomColorJitter(brightness, contrast, saturation, hue)) + + if pca_noise > 0: + augmenter.add(transforms.image.RandomLighting(pca_noise)) + + if rand_gray > 0: + augmenter.add(transforms.image.RandomGray(rand_gray)) + + if mean is True: + mean = [123.68, 116.28, 103.53] + elif mean is not None: + assert isinstance(mean, (tuple, list)) + + if std is True: + std = [58.395, 57.12, 57.375] + elif std is not None: + assert isinstance(std, (tuple, list)) + + augmenter.add(transforms.image.ToTensor()) + + if mean is not None or std is not None: + augmenter.add(transforms.image.Normalize(mean, std)) + + augmenter.add(transforms.Cast(dtype)) + + return augmenter + +class ImageDataLoader(object): + """Image data loader with a large number of augmentation choices. + This loader supports reading from both .rec files and raw image files. + + To load input images from .rec files, use `path_imgrec` parameter and to load from raw image + files, use `path_imglist` and `path_root` parameters. + + To use data partitioning (for distributed training), specify `part_index` and `num_parts`; to shuffle, set `shuffle=True`. + + Parameters + ---------- + batch_size : int + Number of examples per batch. + data_shape : tuple + Data shape in (channels, height, width) format. + For now, only RGB image with 3 channels is supported. + path_imgrec : str + Path to image record file (.rec). + Created with tools/im2rec.py or bin/im2rec. + path_imglist : str + Path to image list (.lst). + Created with tools/im2rec.py or with custom script. + Format: Tab separated record of index, one or more labels and relative_path_from_root. + imglist : list + A list of images with the label(s). + Each item is a list [imagelabel: float or list of float, imgpath]. + path_root : str + Root folder of image files. + shuffle : bool + Whether to shuffle all images at the start of each iteration or not. + Can be slow for HDD. + aug_list : list or None + Augmenter list for generating distorted images + part_index : int + Partition index. + num_parts : int + Total number of partitions. + dtype : str + Label data type. Default: float32. Other options: int32, int64, float64 + last_batch : {'keep', 'discard', 'rollover'} + How to handle the last batch if batch_size does not evenly divide + `len(dataset)`. + + keep - A batch with fewer samples than previous batches is returned. + discard - The last batch is discarded if it is incomplete. + rollover - The remaining samples are rolled over to the next epoch. + kwargs : ... + More arguments for creating augmenter. See mx.gluon.contrib.data.vision.dataloader.create_image_augment. 
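+
+    Examples
+    --------
+    A minimal usage sketch (illustrative; it assumes a local ``val.rec`` record
+    file exists and keeps the default augmenters):
+
+    >>> loader = mx.gluon.contrib.data.vision.ImageDataLoader(
+    ...     batch_size=32, data_shape=(3, 224, 224), path_imgrec='val.rec')
+    >>> for data, label in loader:
+    ...     pass  # data is a (32, 3, 224, 224) image batch, label the per-image labels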
+ """ + def __init__(self, batch_size, data_shape, path_imgrec=None, path_imglist=None, path_root='.', + part_index=0, num_parts=1, aug_list=None, imglist=None, + dtype='float32', shuffle=False, sampler=None, + last_batch=None, batch_sampler=None, batchify_fn=None, + num_workers=0, pin_memory=False, pin_device_id=0, + prefetch=None, thread_pool=False, timeout=120, try_nopython=None, + **kwargs): + assert path_imgrec or path_imglist or (isinstance(imglist, list)) + assert dtype in ['int32', 'float32', 'int64', 'float64'], dtype + ' label not supported' + logging.info('Using %s workers for decoding...', str(num_workers)) + logging.info('Set `num_workers` variable to a larger number to speed up loading' + ' (it requires shared memory to work and may occupy more memory).') + class_name = self.__class__.__name__ + if path_imgrec: + logging.info('%s: loading recordio %s...', + class_name, path_imgrec) + from ....data.vision.datasets import ImageRecordDataset + dataset = ImageRecordDataset(path_imgrec, flag=1) + elif path_imglist: + logging.info('%s: loading image list %s...', class_name, path_imglist) + from ....data.vision.datasets import ImageListDataset + dataset = ImageListDataset(path_root, path_imglist, flag=1) + elif isinstance(imglist, list): + logging.info('%s: loading image list...', class_name) + from ....data.vision.datasets import ImageListDataset + dataset = ImageListDataset(path_root, imglist, flag=1) + else: + raise ValueError('Either path_imgrec, path_imglist, or imglist must be provided') + + if num_parts > 1: + dataset = dataset.shard(num_parts, part_index) + + if aug_list is None: + # apply default transforms + augmenter = create_image_augment(data_shape, **kwargs) + elif isinstance(aug_list, list): + if all([isinstance(a, HybridBlock) for a in aug_list]): + augmenter = HybridSequential('user_img_augment_') + else: + augmenter = Sequential('user_img_augment_') + for aug in aug_list: + augmenter.add(aug) + elif isinstance(aug_list, Block): + augmenter = aug_list + else: + raise ValueError('aug_list must be a list of Blocks or Block') + augmenter.hybridize() + self._iter = DataLoader(dataset.transform_first(augmenter), batch_size=batch_size, + shuffle=shuffle, sampler=sampler, last_batch=last_batch, + batch_sampler=batch_sampler, batchify_fn=batchify_fn, + num_workers=num_workers, pin_memory=pin_memory, + pin_device_id=pin_device_id, prefetch=prefetch, + thread_pool=thread_pool, timeout=timeout, try_nopython=try_nopython) + + def __iter__(self): + return iter(self._iter) + + def __len__(self): + return len(self._iter) + +def create_bbox_augment(data_shape, rand_crop=0, rand_pad=0, rand_gray=0, + rand_mirror=False, mean=None, std=None, brightness=0, contrast=0, + saturation=0, pca_noise=0, hue=0, inter_method=2, + max_aspect_ratio=2, area_range=(0.3, 3.0), + max_attempts=50, pad_val=(127, 127, 127), dtype='float32'): + """Create augmenters for bbox/object detection. 
+ + Parameters + ---------- + data_shape : tuple of int + Shape for output data + rand_crop : float + [0, 1], probability to apply random cropping + rand_pad : float + [0, 1], probability to apply random padding + rand_gray : float + [0, 1], probability to convert to grayscale for all channels + rand_mirror : bool + Whether to apply horizontal flip to image with probability 0.5 + mean : np.ndarray or None + Mean pixel values for [r, g, b] + std : np.ndarray or None + Standard deviations for [r, g, b] + brightness : float + Brightness jittering range (percent) + contrast : float + Contrast jittering range (percent) + saturation : float + Saturation jittering range (percent) + hue : float + Hue jittering range (percent) + pca_noise : float + PCA noise level (percent) + inter_method : int, default=2 (Area-based) + Interpolation method for all resizing operations + + Possible values: + 0: Nearest Neighbors Interpolation. + 1: Bilinear interpolation. + 2: Area-based (resampling using pixel area relation). It may be a + preferred method for image decimation, as it gives moire-free + results. But when the image is zoomed, it is similar to the Nearest + Neighbors method. (used by default). + 3: Bicubic interpolation over 4x4 pixel neighborhood. + 4: Lanczos interpolation over 8x8 pixel neighborhood. + 10: Randomly select from the interpolation methods mentioned above. + Note: + When shrinking an image, it will generally look best with AREA-based + interpolation, whereas, when enlarging an image, it will generally look best + with Bicubic (slow) or Bilinear (faster but still looks OK). + max_aspect_ratio : float + The cropped area of the image must have an aspect ratio = width / height + within this range. + area_range : tuple of floats + The cropped area of the image must contain a fraction of the supplied + image within this range. + max_attempts : int + Number of attempts at generating a cropped/padded region of the image meeting the + specified constraints. After max_attempts failures, return the original image. + pad_val : float or tuple of float + Pixel value to be filled when padding is enabled. pad_val will automatically + have the mean subtracted and be divided by std if applicable. + + Examples + -------- + >>> # An example of creating multiple augmenters + >>> augs = mx.gluon.contrib.data.vision.create_bbox_augment(data_shape=(3, 300, 300), rand_crop=0.5, + ... rand_pad=0.5, rand_mirror=True, mean=True, brightness=0.125, contrast=0.125, + ... saturation=0.125, pca_noise=0.05, inter_method=10, + ... 
area_range=(0.3, 3.0)) + """ + if inter_method == 10: + inter_method = np.random.randint(0, 5) + augmenter = Sequential('default_bbox_aug_') + if rand_crop > 0: + augmenter.add(bbox.ImageBboxRandomCropWithConstraints( + p=rand_crop, min_scale=area_range[0], max_scale=1.0, + max_aspect_ratio=max_aspect_ratio, max_trial=max_attempts)) + + if rand_mirror > 0: + augmenter.add(bbox.ImageBboxRandomFlipLeftRight(0.5)) + + if rand_pad > 0: + augmenter.add(bbox.ImageBboxRandomExpand( + p=rand_pad, max_ratio=area_range[1], fill=pad_val)) + + # force resize + augmenter.add(bbox.ImageBboxResize(data_shape[2], data_shape[1], interp=inter_method)) + + if brightness or contrast or saturation or hue: + augmenter.add(transforms.image.RandomColorJitter( + brightness=brightness, contrast=contrast, saturation=saturation, hue=hue)) + + if pca_noise > 0: + augmenter.add(transforms.image.RandomLighting(pca_noise)) + + if rand_gray > 0: + augmenter.add(transforms.image.RandomGray(rand_gray)) + + if mean is True: + mean = [123.68, 116.28, 103.53] + elif mean is not None: + assert isinstance(mean, (tuple, list)) + + if std is True: + std = [58.395, 57.12, 57.375] + elif std is not None: + assert isinstance(std, (tuple, list)) + + augmenter.add(transforms.image.ToTensor()) + if mean is not None or std is not None: + augmenter.add(transforms.image.Normalize(mean, std)) + + augmenter.add(transforms.Cast(dtype)) + + return augmenter + + +class ImageBboxDataLoader(object): + """Image iterator with a large number of augmentation choices for detection. + + Parameters + ---------- + batch_size : int + Number of examples per batch. + data_shape : tuple + Data shape in (channels, height, width) format. + For now, only RGB image with 3 channels is supported. + path_imgrec : str + Path to image record file (.rec). + Created with tools/im2rec.py or bin/im2rec. + path_imglist : str + Path to image list (.lst). + Created with tools/im2rec.py or with custom script. + Format: Tab separated record of index, one or more labels and relative_path_from_root. + imglist: list + A list of images with the label(s). + Each item is a list [imagelabel: float or list of float, imgpath]. + path_root : str + Root folder of image files. + shuffle : bool + Whether to shuffle all images at the start of each iteration or not. + Can be slow for HDD. + aug_list : list or None + Augmenter list for generating distorted images + part_index : int + Partition index. + num_parts : int + Total number of partitions. + last_batch : {'keep', 'discard', 'rollover'} + How to handle the last batch if batch_size does not evenly divide + `len(dataset)`. + + keep - A batch with less samples than previous batches is returned. + discard - The last batch is discarded if its incomplete. + rollover - The remaining samples are rolled over to the next epoch. + kwargs : ... + More arguments for creating augmenter. See mx.gluon.contrib.data.create_bbox_augment. 
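+
+    Examples
+    --------
+    A minimal sketch; the record file path below is hypothetical and assumed
+    to be generated with tools/im2rec.py with packed bbox labels:
+
+    >>> loader = ImageBboxDataLoader(batch_size=2, data_shape=(3, 300, 300),
+    ...     path_imgrec='data/val.rec', rand_crop=0.5, rand_mirror=True)
+    >>> for batch_image, batch_label in loader:
+    ...     pass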
+ """ + def __init__(self, batch_size, data_shape, path_imgrec=None, path_imglist=None, path_root='.', + part_index=0, num_parts=1, aug_list=None, imglist=None, + coord_normalized=True, dtype='float32', shuffle=False, sampler=None, + last_batch=None, batch_sampler=None, batchify_fn=None, + num_workers=0, pin_memory=False, pin_device_id=0, + prefetch=None, thread_pool=False, timeout=120, try_nopython=None, + **kwargs): + assert path_imgrec or path_imglist or (isinstance(imglist, list)) + assert dtype in ['int32', 'float32', 'int64', 'float64'], dtype + ' label not supported' + logging.info('Using %s workers for decoding...', str(num_workers)) + logging.info('Set `num_workers` variable to a larger number to speed up loading' + ' (it requires shared memory to work and may occupy more memory).') + class_name = self.__class__.__name__ + if path_imgrec: + logging.info('%s: loading recordio %s...', + class_name, path_imgrec) + from ....data.vision.datasets import ImageRecordDataset + dataset = ImageRecordDataset(path_imgrec, flag=1) + elif path_imglist: + logging.info('%s: loading image list %s...', class_name, path_imglist) + from ....data.vision.datasets import ImageListDataset + dataset = ImageListDataset(path_root, path_imglist, flag=1) + elif isinstance(imglist, list): + logging.info('%s: loading image list...', class_name) + from ....data.vision.datasets import ImageListDataset + dataset = ImageListDataset(path_root, imglist, flag=1) + else: + raise ValueError('Either path_imgrec, path_imglist, or imglist must be provided') + + if num_parts > 1: + dataset = dataset.shard(num_parts, part_index) + + if aug_list is None: + # apply default transforms + augmenter = create_bbox_augment(data_shape, **kwargs) + elif isinstance(aug_list, list): + if all([isinstance(a, HybridBlock) for a in aug_list]): + augmenter = HybridSequential('user_bbox_augment_') + else: + augmenter = Sequential('user_bbox_augment_') + for aug in aug_list: + augmenter.add(aug) + elif isinstance(aug_list, Block): + augmenter = aug_list + else: + raise ValueError('aug_list must be a list of Blocks') + augmenter.hybridize() + wrapper_aug = Sequential('wrapper_bbox_aug_') + wrapper_aug.add(BboxLabelTransform(coord_normalized)) + wrapper_aug.add(augmenter) + + if batchify_fn is None: + from ....data.batchify import Stack, Pad, Group + pad_batchify = Pad(val=-1) + pad_batchify._warned = True + batchify_fn = Group(Stack(), pad_batchify) # stack image, pad bbox + self._iter = DataLoader(dataset.transform(wrapper_aug), batch_size=batch_size, + shuffle=shuffle, sampler=sampler, last_batch=last_batch, + batch_sampler=batch_sampler, batchify_fn=batchify_fn, + num_workers=num_workers, pin_memory=pin_memory, + pin_device_id=pin_device_id, prefetch=prefetch, + thread_pool=thread_pool, timeout=timeout, try_nopython=try_nopython) + + def __iter__(self): + return iter(self._iter) + + def __len__(self): + return len(self._iter) + +class BboxLabelTransform(Block): + """Transform to convert 1-D bbox label to 2-D as in shape Nx5. + + Parameters + ---------- + coord_normalized : bool + Whether the coordinates(x0, y0, x1, y1) are normalized to (0, 1). 
+ + """ + def __init__(self, coord_normalized=True): + super(BboxLabelTransform, self).__init__() + self._coord_normalized = coord_normalized + + def forward(self, img, label): + """transform 1-D bbox label to Nx5 ndarray""" + if self._coord_normalized: + height = img.shape[0] + width = img.shape[1] + else: + height = width = None + if not isinstance(label, np.ndarray): + label = label.asnumpy() + label = label.flatten() + header_len = int(label[0]) # label header + label_width = int(label[1]) # the label width for each object, >= 5 + if label_width < 5: + raise ValueError( + "Label info for each object should >= 5, given {}".format(label_width)) + min_len = header_len + 5 + if len(label) < min_len: + raise ValueError( + "Expected label length >= {}, got {}".format(min_len, len(label))) + if (len(label) - header_len) % label_width: + raise ValueError( + "Broken label of size {}, cannot reshape into (N, {}) " + "if header length {} is excluded".format(len(label), label_width, header_len)) + bbox_label = label[header_len:].reshape(-1, label_width) + # swap columns, requires [xmin-ymin-xmax-ymax-id-extra0-extra1-xxx] + ids = bbox_label[:, 0].copy() + bbox_label[:, :4] = bbox_label[:, 1:5] + bbox_label[:, 4] = ids + # restore to absolute coordinates + if width is not None: + bbox_label[:, (0, 2)] *= width + if height is not None: + bbox_label[:, (1, 3)] *= height + array_fn = _mx_np.array if is_np_array() else nd.array + return img, array_fn(bbox_label) diff --git a/python/mxnet/gluon/contrib/data/vision/transforms/__init__.py b/python/mxnet/gluon/contrib/data/vision/transforms/__init__.py new file mode 100644 index 000000000000..f9ed3fef562a --- /dev/null +++ b/python/mxnet/gluon/contrib/data/vision/transforms/__init__.py @@ -0,0 +1,21 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=wildcard-import +"""Contrib vision transforms.""" +from .bbox import * diff --git a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py new file mode 100644 index 000000000000..c3496b7086ef --- /dev/null +++ b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py @@ -0,0 +1,26 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +all: html + +html: + mkdir -p build/html + doxygen Doxyfile + + +clean: + rm -rf build diff --git a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py new file mode 100644 index 000000000000..42cfb7afaefb --- /dev/null +++ b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py @@ -0,0 +1,344 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable= arguments-differ, wildcard-import +"Bounding box transforms." +import random + +from .......base import numeric_types +from ......block import Block +from .......util import is_np_array +from ....... import nd, npx, np +from .utils import _check_bbox_shape, bbox_crop, bbox_translate +from .utils import bbox_resize, bbox_random_crop_with_constraints + +__all__ = ['ImageBboxRandomFlipLeftRight', 'ImageBboxCrop', + 'ImageBboxRandomCropWithConstraints', 'ImageBboxResize'] + + +class ImageBboxRandomFlipLeftRight(Block): + """Randomly flip the input image and bbox left to right with a probability + of p(0.5 by default). + + Parameters + ---------- + p : float + The probability to preceed with random cropping logic. + + Inputs: + - **data**: input tensor with (Hi x Wi x C) shape. + - **bbox**: input tensor with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + + Outputs: + - **out**: output tensor with same shape as `data`. + - **bbox**: input tensor with same shape as `bbox`. 
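+
+    Examples
+    --------
+    A minimal sketch with a deterministic flip (p=1) on a 4x4 image:
+
+    >>> aug = ImageBboxRandomFlipLeftRight(p=1.0)
+    >>> img, bbox = aug(nd.zeros((4, 4, 3)), nd.array([[0, 0, 2, 2]]))
+    >>> # bbox is now [[2, 0, 4, 2]]: x coordinates mirrored across the width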
+ """ + def __init__(self, p=0.5): + super(ImageBboxRandomFlipLeftRight, self).__init__() + self.p = p + + def forward(self, img, bbox): + _check_bbox_shape(bbox) + if self.p <= 0: + return img, bbox + elif self.p >= 1: + img = self._flip_image(img) + bbox = self._flip_bbox(img, bbox) + return img, bbox + else: + if self.p < random.random(): + return img, bbox + else: + img = self._flip_image(img) + bbox = self._flip_bbox(img, bbox) + return img, bbox + + def _flip_image(self, img): + if is_np_array(): + return npx.image.flip_left_right(img) + else: + return nd.image.flip_left_right(img) + + def _flip_bbox(self, img, bbox): + width = img.shape[-2] + xmax = width - bbox[:, 0] + xmin = width - bbox[:, 2] + bbox[:, 0] = xmin + bbox[:, 2] = xmax + return bbox + + +class ImageBboxCrop(Block): + """Crops the image `src` and `bbox` to the given `crop`. + + Parameters + ---------- + crop_box : tuple + Tuple of length 4. :math:`(x_{min}, y_{min}, width, height)` + allow_outside_center : bool + If `False`, remove bounding boxes which have centers outside cropping area. + + + Inputs: + - **data**: input tensor with (Hi x Wi x C) shape. + - **bbox**: input tensor with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + + Outputs: + - **out**: output tensor with (H x W x C) shape. + - **bbox**: output tensor with shape (M, 4+) where M <= N is the number of valid bounding + boxes after cropping. :math:`(x_{min}, y_{min}, x_{max}, y_{max})` + + """ + def __init__(self, crop, allow_outside_center=False): + super(ImageBboxCrop, self).__init__() + assert len(crop) == 4, "expect crop to be (x_min, y_min, x_max, y_max)" + self.xmin = crop[0] + self.ymin = crop[1] + self.width = crop[2] + self.height = crop[3] + assert self.xmin >= 0 + assert self.ymin >= 0 + assert self.width > 0 + assert self.height > 0 + self.xmax = self.width + self.xmin + self.ymax = self.height + self.ymin + self._allow_outside_center = allow_outside_center + + def forward(self, img, bbox): + if self.xmax >= img.shape[-2] or self.ymax >= img.shape[-3]: + return img, bbox + if is_np_array(): + new_img = npx.image.crop(img, self.xmin, self.ymin, self.width, self.height) + new_bbox = np.array(bbox_crop(bbox.asnumpy(), + (self.xmin, self.ymin, self.width, self.height), + self._allow_outside_center)) + else: + new_img = nd.image.crop(img, self.xmin, self.ymin, self.width, self.height) + new_bbox = nd.array(bbox_crop(bbox.asnumpy(), + (self.xmin, self.ymin, self.width, self.height), + self._allow_outside_center)) + return new_img, new_bbox + + +class ImageBboxRandomCropWithConstraints(Block): + """Crop an image randomly with bounding box constraints. + + Please check `mx.gluon.contrib.data.transforms.bbox.utils.bbox_random_crop_with_constraints` + for implementation details. + + Parameters + ---------- + p : float + The probability to preceed with random cropping logic. + min_scale : float + The minimum ratio between a cropped region and the original image. + The default value is :obj:`0.3`. + max_scale : float + The maximum ratio between a cropped region and the original image. + The default value is :obj:`1`. + max_aspect_ratio : float + The maximum aspect ratio of cropped region. + The default value is :obj:`2`. + constraints : iterable of tuples + An iterable of constraints. 
+ Each constraint should be :obj:`(min_iou, max_iou)` format. + If means no constraint if set :obj:`min_iou` or :obj:`max_iou` to :obj:`None`. + If this argument defaults to :obj:`None`, :obj:`((0.1, None), (0.3, None), + (0.5, None), (0.7, None), (0.9, None), (None, 1))` will be used. + max_trial : int + Maximum number of trials for each constraint before exit no matter what. + + Inputs: + - **data**: input tensor with (Hi x Wi x C) shape. + - **bbox**: input tensor with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + + Outputs: + - **out**: Cropped image with shape (H x W x C) + - **bbox**: Cropped bounding boxes with shape :obj:`(M, 4+)` where M <= N. + Tuple of length 4 as :math:`(x_{min}, y_{min}, x_{max}, y_{max})`. + """ + def __init__(self, p=0.5, min_scale=0.3, max_scale=1, + max_aspect_ratio=2, constraints=None, + max_trial=50): + super(ImageBboxRandomCropWithConstraints, self).__init__() + self.p = p + self._args = { + "min_scale": min_scale, + "max_scale": max_scale, + "max_aspect_ratio": max_aspect_ratio, + "constraints": constraints, + "max_trial": max_trial + } + + def forward(self, img, bbox): + if random.random() > self.p: + return img, bbox + im_size = (img.shape[-2], img.shape[-3]) + new_bbox, crop = bbox_random_crop_with_constraints(bbox.asnumpy(), im_size, **self._args) + if crop == (0, 0, im_size[0], im_size[1]): + return img, bbox + if is_np_array(): + new_img = npx.image.crop(img, x=crop[0], y=crop[1], width=crop[2], height=crop[3]) + new_bbox = np.array(new_bbox) + else: + new_img = nd.image.crop(img, x=crop[0], y=crop[1], width=crop[2], height=crop[3]) + new_bbox = nd.array(new_bbox) + return new_img, new_bbox + + +class ImageBboxRandomExpand(Block): + """Randomly expand image to a larger region with padded pixels. + Apply tranlation to bounding boxes accordingly. + + Parameters + ---------- + p : float + The probability to preceed with random cropping logic. + max_ratio : float + The minimum expansion ratio. If `max_ratio` is 2, the range of + output image size is 1x ~ 2x of the original input size. + fill : float or tuple of float + The value(s) for the pixels in expanded regions. Can be scalar or tuple, + note the if tuple is provided, its size must match the image channels, typically 3. + keep_ratio : bool + If `True`, the output must have the same aspect ratio as input, otherwise the output + can have arbitrary aspect ratio. + + Inputs: + - **data**: input tensor with (Hi x Wi x C) shape. + - **bbox**: input tensor with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + + Outputs: + - **out**: Cropped image with shape (H x W x C) + - **bbox**: Cropped bounding boxes with shape :obj:`(N, 4+)`. + Tuple of length 4 as :math:`(x_{min}, y_{min}, x_{max}, y_{max})`. 
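+
+    Examples
+    --------
+    A minimal sketch that always expands (p=1) up to 2x, padding with 127:
+
+    >>> aug = ImageBboxRandomExpand(p=1.0, max_ratio=2, fill=127)
+    >>> img, bbox = aug(nd.zeros((100, 100, 3)), nd.array([[10, 10, 50, 50]]))
+    >>> # img is now between 100x100 and 200x200; bbox is translated by the
+    >>> # same random offset at which the original image was placed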
+ + """ + def __init__(self, p=0.5, max_ratio=4, fill=0, keep_ratio=True): + super(ImageBboxRandomExpand, self).__init__() + self.p = p + self._max_ratio = max_ratio + self._fill = fill + self._keep_ratio = keep_ratio + + def forward(self, img, bbox): + if self._max_ratio <= 1 or random.random() > self.p: + return img, bbox + if len(img.shape) != 3: + raise NotImplementedError('ImageBboxRandomExpand only support images in HWC format') + + h, w, c = img.shape + ratio_x = random.uniform(1, self._max_ratio) + if self._keep_ratio: + ratio_y = ratio_x + else: + ratio_y = random.uniform(1, self._max_ratio) + + oh, ow = int(h * ratio_y), int(w * ratio_x) + off_y = random.randint(0, oh - h) + off_x = random.randint(0, ow - w) + + # make canvas + if is_np_array(): + F = np + else: + F = nd + if isinstance(self._fill, numeric_types): + dst = F.full(shape=(oh, ow, c), val=self._fill, dtype=img.dtype) + else: + fill = F.array(self._fill, dtype=img.dtype, ctx=img.context) + if not c == fill.size: + raise ValueError("Channel and fill size mismatch, {} vs {}".format(c, fill.size)) + dst = F.tile(fill.reshape((1, c)), reps=(oh * ow, 1)).reshape((oh, ow, c)) + + dst[off_y:off_y+h, off_x:off_x+w, :] = img + + # translate bbox + new_bbox = bbox_translate(bbox.asnumpy(), off_x, off_y) + if is_np_array(): + new_bbox = np.array(new_bbox) + else: + new_bbox = nd.array(new_bbox) + + return dst, new_bbox + + +class ImageBboxResize(Block): + """Apply resize to image and bounding boxes. + + Parameters + ---------- + width : int + The target output width. + height : int + The target output height. + + Inputs: + - **data**: input tensor with (Hi x Wi x C) shape. + - **bbox**: input tensor with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + + Outputs: + - **out**: Cropped image with shape (H x W x C) + - **bbox**: Cropped bounding boxes with shape :obj:`(M, 4+)` where M <= N. + Tuple of length 4 as :math:`(x_{min}, y_{min}, x_{max}, y_{max})`. + + """ + def __init__(self, width, height, interp=1): + super(ImageBboxResize, self).__init__() + self._size = (width, height) + self._interp = interp + + def forward(self, img, bbox): + if len(img.shape) != 3: + raise NotImplementedError('ImageBboxResize only support images in HWC format') + + if self._interp == -1: + # random interpolation mode + interp = random.randint(0, 5) + else: + interp = self._interp + + if is_np_array(): + new_img = npx.image.resize(img, self._size, False, interp) + new_bbox = np.array(bbox_resize(bbox.asnumpy(), + (img.shape[-2], img.shape[-3]), self._size)) + else: + new_img = nd.image.resize(img, self._size, False, interp) + new_bbox = nd.array(bbox_resize(bbox.asnumpy(), + (img.shape[-2], img.shape[-3]), self._size)) + return new_img, new_bbox diff --git a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/utils.py b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/utils.py new file mode 100644 index 000000000000..cba374e0464f --- /dev/null +++ b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/utils.py @@ -0,0 +1,428 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable= arguments-differ, wildcard-import +"Bounding box utilities." +from __future__ import division +import random + +import numpy as np + +def _check_bbox_shape(bbox): + assert len(bbox.shape) == 2, "bbox requires shape of (N, 4+), given: {}".format(bbox.shape) + assert bbox.shape[1] >= 4, "bbox requires shape of (N, 4+), given: {}".format(bbox.shape) + +def bbox_crop(bbox, crop_box=None, allow_outside_center=True): + """Crop bounding boxes according to slice area. + This method is mainly used with image cropping to ensure bonding boxes fit + within the cropped image. + + Parameters + ---------- + bbox : numpy.ndarray + Numpy.ndarray with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + crop_box : tuple + Tuple of length 4. :math:`(x_{min}, y_{min}, width, height)` + allow_outside_center : bool + If `False`, remove bounding boxes which have centers outside cropping area. + Returns + ------- + numpy.ndarray + Cropped bounding boxes with shape (M, 4+) where M <= N. + """ + bbox = bbox.copy() + if crop_box is None: + return bbox + if not len(crop_box) == 4: + raise ValueError( + "Invalid crop_box parameter, requires length 4, given {}".format(str(crop_box))) + if sum([int(c is None) for c in crop_box]) == 4: + return bbox + + l, t, w, h = crop_box + + left = l if l else 0 + top = t if t else 0 + right = left + (w if w else np.inf) + bottom = top + (h if h else np.inf) + crop_bbox = np.array((left, top, right, bottom)) + + if allow_outside_center: + mask = np.ones(bbox.shape[0], dtype=bool) + else: + centers = (bbox[:, :2] + bbox[:, 2:4]) / 2 + mask = ((crop_bbox[:2] <= centers) * (centers < crop_bbox[2:])).all(axis=1) + + # transform borders + bbox[:, :2] = np.maximum(bbox[:, :2], crop_bbox[:2]) + bbox[:, 2:4] = np.minimum(bbox[:, 2:4], crop_bbox[2:4]) + bbox[:, :2] -= crop_bbox[:2] + bbox[:, 2:4] -= crop_bbox[:2] + + mask = (mask * (bbox[:, :2] < bbox[:, 2:4]).all(axis=1)) + bbox = bbox[mask] + return bbox + +def bbox_flip(bbox, size, flip_x=False, flip_y=False): + """Flip bounding boxes according to image flipping directions. + + Parameters + ---------- + bbox : numpy.ndarray + Numpy.ndarray with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + size : tuple + Tuple of length 2: (width, height). + flip_x : bool + Whether flip horizontally. + flip_y : bool + Whether flip vertically. + + Returns + ------- + numpy.ndarray + Flipped bounding boxes with original shape. 
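+
+    Examples
+    --------
+    >>> bbox = np.array([[10., 20., 30., 40.]])
+    >>> bbox_flip(bbox, size=(100, 100), flip_x=True)
+    array([[70., 20., 90., 40.]])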
+ """ + if not len(size) == 2: + raise ValueError("size requires length 2 tuple, given {}".format(len(size))) + width, height = size + bbox = bbox.copy() + if flip_y: + ymax = height - bbox[:, 1] + ymin = height - bbox[:, 3] + bbox[:, 1] = ymin + bbox[:, 3] = ymax + if flip_x: + xmax = width - bbox[:, 0] + xmin = width - bbox[:, 2] + bbox[:, 0] = xmin + bbox[:, 2] = xmax + return bbox + +def bbox_resize(bbox, in_size, out_size): + """Resize bouding boxes according to image resize operation. + + Parameters + ---------- + bbox : numpy.ndarray + Numpy.ndarray with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + in_size : tuple + Tuple of length 2: (width, height) for input. + out_size : tuple + Tuple of length 2: (width, height) for output. + + Returns + ------- + numpy.ndarray + Resized bounding boxes with original shape. + """ + if not len(in_size) == 2: + raise ValueError("in_size requires length 2 tuple, given {}".format(len(in_size))) + if not len(out_size) == 2: + raise ValueError("out_size requires length 2 tuple, given {}".format(len(out_size))) + + bbox = bbox.copy().astype(float) + x_scale = out_size[0] / in_size[0] + y_scale = out_size[1] / in_size[1] + bbox[:, 1] = y_scale * bbox[:, 1] + bbox[:, 3] = y_scale * bbox[:, 3] + bbox[:, 0] = x_scale * bbox[:, 0] + bbox[:, 2] = x_scale * bbox[:, 2] + return bbox + +def bbox_translate(bbox, x_offset=0, y_offset=0): + """Translate bounding boxes by offsets. + + Parameters + ---------- + bbox : numpy.ndarray + Numpy.ndarray with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + x_offset : int or float + Offset along x axis. + y_offset : int or float + Offset along y axis. + + Returns + ------- + numpy.ndarray + Translated bounding boxes with original shape. + """ + bbox = bbox.copy() + bbox[:, :2] += (x_offset, y_offset) + bbox[:, 2:4] += (x_offset, y_offset) + return bbox + +def bbox_iou(bbox_a, bbox_b, offset=0): + """Calculate Intersection-Over-Union(IOU) of two bounding boxes. + + Parameters + ---------- + bbox_a : numpy.ndarray + An ndarray with shape :math:`(N, 4)`. + bbox_b : numpy.ndarray + An ndarray with shape :math:`(M, 4)`. + offset : float or int, default is 0 + The ``offset`` is used to control the whether the width(or height) is computed as + (right - left + ``offset``). + Note that the offset must be 0 for normalized bboxes, whose ranges are in ``[0, 1]``. + + Returns + ------- + numpy.ndarray + An ndarray with shape :math:`(N, M)` indicates IOU between each pairs of + bounding boxes in `bbox_a` and `bbox_b`. 
+ + """ + if bbox_a.shape[1] < 4 or bbox_b.shape[1] < 4: + raise IndexError("Bounding boxes axis 1 must have at least length 4") + + tl = np.maximum(bbox_a[:, None, :2], bbox_b[:, :2]) + br = np.minimum(bbox_a[:, None, 2:4], bbox_b[:, 2:4]) + + area_i = np.prod(br - tl + offset, axis=2) * (tl < br).all(axis=2) + area_a = np.prod(bbox_a[:, 2:4] - bbox_a[:, :2] + offset, axis=1) + area_b = np.prod(bbox_b[:, 2:4] - bbox_b[:, :2] + offset, axis=1) + return area_i / (area_a[:, None] + area_b - area_i) + + +def bbox_xywh_to_xyxy(xywh): + """Convert bounding boxes from format (xmin, ymin, w, h) to (xmin, ymin, xmax, ymax) + + Parameters + ---------- + xywh : list, tuple or numpy.ndarray + The bbox in format (x, y, w, h). + If numpy.ndarray is provided, we expect multiple bounding boxes with + shape `(N, 4)`. + + Returns + ------- + tuple or numpy.ndarray + The converted bboxes in format (xmin, ymin, xmax, ymax). + If input is numpy.ndarray, return is numpy.ndarray correspondingly. + + """ + if isinstance(xywh, (tuple, list)): + if not len(xywh) == 4: + raise IndexError( + "Bounding boxes must have 4 elements, given {}".format(len(xywh))) + w, h = np.maximum(xywh[2] - 1, 0), np.maximum(xywh[3] - 1, 0) + return xywh[0], xywh[1], xywh[0] + w, xywh[1] + h + elif isinstance(xywh, np.ndarray): + if not xywh.size % 4 == 0: + raise IndexError( + "Bounding boxes must have n * 4 elements, given {}".format(xywh.shape)) + xyxy = np.hstack((xywh[:, :2], xywh[:, :2] + np.maximum(0, xywh[:, 2:4] - 1))) + return xyxy + else: + raise TypeError( + 'Expect input xywh a list, tuple or numpy.ndarray, given {}'.format(type(xywh))) + + +def bbox_xyxy_to_xywh(xyxy): + """Convert bounding boxes from format (xmin, ymin, xmax, ymax) to (x, y, w, h). + + Parameters + ---------- + xyxy : list, tuple or numpy.ndarray + The bbox in format (xmin, ymin, xmax, ymax). + If numpy.ndarray is provided, we expect multiple bounding boxes with + shape `(N, 4)`. + + Returns + ------- + tuple or numpy.ndarray + The converted bboxes in format (x, y, w, h). + If input is numpy.ndarray, return is numpy.ndarray correspondingly. + + """ + if isinstance(xyxy, (tuple, list)): + if not len(xyxy) == 4: + raise IndexError( + "Bounding boxes must have 4 elements, given {}".format(len(xyxy))) + x1, y1 = xyxy[0], xyxy[1] + w, h = xyxy[2] - x1 + 1, xyxy[3] - y1 + 1 + return x1, y1, w, h + elif isinstance(xyxy, np.ndarray): + if not xyxy.size % 4 == 0: + raise IndexError( + "Bounding boxes must have n * 4 elements, given {}".format(xyxy.shape)) + return np.hstack((xyxy[:, :2], xyxy[:, 2:4] - xyxy[:, :2] + 1)) + else: + raise TypeError( + 'Expect input xywh a list, tuple or numpy.ndarray, given {}'.format(type(xyxy))) + + +def bbox_clip_xyxy(xyxy, width, height): + """Clip bounding box with format (xmin, ymin, xmax, ymax) to specified boundary. + + All bounding boxes will be clipped to the new region `(0, 0, width, height)`. + + Parameters + ---------- + xyxy : list, tuple or numpy.ndarray + The bbox in format (xmin, ymin, xmax, ymax). + If numpy.ndarray is provided, we expect multiple bounding boxes with + shape `(N, 4)`. + width : int or float + Boundary width. + height : int or float + Boundary height. + + Returns + ------- + type + Description of returned object. 
+ + """ + if isinstance(xyxy, (tuple, list)): + if not len(xyxy) == 4: + raise IndexError( + "Bounding boxes must have 4 elements, given {}".format(len(xyxy))) + x1 = np.minimum(width - 1, np.maximum(0, xyxy[0])) + y1 = np.minimum(height - 1, np.maximum(0, xyxy[1])) + x2 = np.minimum(width - 1, np.maximum(0, xyxy[2])) + y2 = np.minimum(height - 1, np.maximum(0, xyxy[3])) + return x1, y1, x2, y2 + elif isinstance(xyxy, np.ndarray): + if not xyxy.size % 4 == 0: + raise IndexError( + "Bounding boxes must have n * 4 elements, given {}".format(xyxy.shape)) + x1 = np.minimum(width - 1, np.maximum(0, xyxy[:, 0])) + y1 = np.minimum(height - 1, np.maximum(0, xyxy[:, 1])) + x2 = np.minimum(width - 1, np.maximum(0, xyxy[:, 2])) + y2 = np.minimum(height - 1, np.maximum(0, xyxy[:, 3])) + return np.hstack((x1, y1, x2, y2)) + else: + raise TypeError( + 'Expect input xywh a list, tuple or numpy.ndarray, given {}'.format(type(xyxy))) + +def bbox_random_crop_with_constraints(bbox, size, min_scale=0.3, max_scale=1, + max_aspect_ratio=2, constraints=None, + max_trial=50): + """Crop an image randomly with bounding box constraints. + + This data augmentation is used in training of + Single Shot Multibox Detector [#]_. More details can be found in + data augmentation section of the original paper. + .. [#] Wei Liu, Dragomir Anguelov, Dumitru Erhan, Christian Szegedy, + Scott Reed, Cheng-Yang Fu, Alexander C. Berg. + SSD: Single Shot MultiBox Detector. ECCV 2016. + + Parameters + ---------- + bbox : numpy.ndarray + Numpy.ndarray with shape (N, 4+) where N is the number of bounding boxes. + The second axis represents attributes of the bounding box. + Specifically, these are :math:`(x_{min}, y_{min}, x_{max}, y_{max})`, + we allow additional attributes other than coordinates, which stay intact + during bounding box transformations. + size : tuple + Tuple of length 2 of image shape as (width, height). + min_scale : float + The minimum ratio between a cropped region and the original image. + The default value is :obj:`0.3`. + max_scale : float + The maximum ratio between a cropped region and the original image. + The default value is :obj:`1`. + max_aspect_ratio : float + The maximum aspect ratio of cropped region. + The default value is :obj:`2`. + constraints : iterable of tuples + An iterable of constraints. + Each constraint should be :obj:`(min_iou, max_iou)` format. + If means no constraint if set :obj:`min_iou` or :obj:`max_iou` to :obj:`None`. + If this argument defaults to :obj:`None`, :obj:`((0.1, None), (0.3, None), + (0.5, None), (0.7, None), (0.9, None), (None, 1))` will be used. + max_trial : int + Maximum number of trials for each constraint before exit no matter what. + + Returns + ------- + numpy.ndarray + Cropped bounding boxes with shape :obj:`(M, 4+)` where M <= N. + tuple + Tuple of length 4 as (x_offset, y_offset, new_width, new_height). 
+ + """ + # default params in paper + if constraints is None: + constraints = ( + (0.1, None), + (0.3, None), + (0.5, None), + (0.7, None), + (0.9, None), + (None, 1), + ) + + w, h = size + + candidates = [(0, 0, w, h)] + for min_iou, max_iou in constraints: + min_iou = -np.inf if min_iou is None else min_iou + max_iou = np.inf if max_iou is None else max_iou + + for _ in range(max_trial): + scale = random.uniform(min_scale, max_scale) + aspect_ratio = random.uniform( + max(1 / max_aspect_ratio, scale * scale), + min(max_aspect_ratio, 1 / (scale * scale))) + crop_h = int(h * scale / np.sqrt(aspect_ratio)) + crop_w = int(w * scale * np.sqrt(aspect_ratio)) + + crop_t = random.randrange(h - crop_h) + crop_l = random.randrange(w - crop_w) + crop_bb = np.array((crop_l, crop_t, crop_l + crop_w, crop_t + crop_h)) + + if len(bbox) == 0: + top, bottom = crop_t, crop_t + crop_h + left, right = crop_l, crop_l + crop_w + return bbox, (left, top, right-left, bottom-top) + + iou = bbox_iou(bbox, crop_bb[np.newaxis]) + if min_iou <= iou.min() and iou.max() <= max_iou: + top, bottom = crop_t, crop_t + crop_h + left, right = crop_l, crop_l + crop_w + candidates.append((left, top, right-left, bottom-top)) + break + + # random select one + while candidates: + crop = candidates.pop(np.random.randint(0, len(candidates))) + new_bbox = bbox_crop(bbox, crop, allow_outside_center=False) + if new_bbox.size < 1: + continue + new_crop = (crop[0], crop[1], crop[2], crop[3]) + return new_bbox, new_crop + return bbox, (0, 0, w, h) diff --git a/python/mxnet/gluon/data/__init__.py b/python/mxnet/gluon/data/__init__.py index 23ae3e9b3be6..b20fad3c977b 100644 --- a/python/mxnet/gluon/data/__init__.py +++ b/python/mxnet/gluon/data/__init__.py @@ -26,3 +26,5 @@ from .dataloader import * from . import vision + +from . import _internal diff --git a/python/mxnet/gluon/data/_internal.py b/python/mxnet/gluon/data/_internal.py new file mode 100644 index 000000000000..925cf5453527 --- /dev/null +++ b/python/mxnet/gluon/data/_internal.py @@ -0,0 +1,353 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +"""C++ Datasets for common data formats.""" +import sys +import ctypes + +from .dataset import Dataset +from .sampler import Sampler +from ...base import _LIB +from ...base import c_str_array, mx_uint, py_str +from ...base import DatasetHandle, NDArrayHandle, BatchifyFunctionhandle +from ...base import check_call, build_param_doc as _build_param_doc +from ...ndarray import NDArray +from ...ndarray import _ndarray_cls +from ...numpy.multiarray import _np_ndarray_cls +from ...util import is_np_array, default_array +from ...io import io as _io + + +class MXDataset(Dataset): + """A python wrapper a C++ dataset. 
+ + Parameters + ---------- + handle : DatasetHandle, required + The handle to the underlying C++ Dataset. + + """ + def __init__(self, handle, **kwargs): + super(MXDataset, self).__init__() + self.handle = handle + self._kwargs = kwargs + # get dataset size + length = ctypes.c_uint64(0) + check_call(_LIB.MXDatasetGetLen(self.handle, ctypes.byref(length))) + self._len = length.value + + def __del__(self): + check_call(_LIB.MXDatasetFree(self.handle)) + + def __len__(self): + return self._len + + def __getitem__(self, idx): + orig_idx = idx + if idx < 0: + idx += self._len + # check bound + if idx < 0 or idx >= self._len: + raise IndexError("Index {} out of bound: (0, {})".format(orig_idx, self._len)) + create_ndarray_fn = _np_ndarray_cls if is_np_array() else _ndarray_cls + output_vars = ctypes.POINTER(NDArrayHandle)() + num_output = ctypes.c_int(0) + check_call(_LIB.MXDatasetGetItems(self.handle, + ctypes.c_uint64(idx), + ctypes.byref(num_output), + ctypes.byref(output_vars))) + out = [create_ndarray_fn(ctypes.cast(output_vars[i], NDArrayHandle), + False) for i in range(num_output.value)] + for i in range(num_output.value): + if out[i].size == 1: + out[i] = out[i].asnumpy() + if len(out) > 1: + return tuple(out) + return out[0] + + +class MXSampler(Sampler): + """MXNet internal sampler implemented in c++. + + Parameters + ---------- + name : str + Name of the sampler. + + """ + def __init__(self, name, **kwargs): + try: + creator = getattr(_io, name) + except AttributeError: + raise ValueError('{} is not a valid MXDataIter class'.format(name)) + self._iter = creator(**kwargs) + + def __len__(self): + try: + size = len(self._iter) + except TypeError: + raise TypeError('Iterator {} does not provide length info'.format(self._iter)) + return size + + def __iter__(self): + for item in self._iter: + ret = item.data[0].asnumpy().flatten().tolist() + pad = item.pad + if pad > 0: + # remove padded values + ret = ret[:-pad] + elif len(ret) == 1: + ret = ret[0] + yield ret + self._iter.reset() + + +class MXBatchifyFunction(object): + """MXNet batchify function implemented in C++. + + Parameters + ---------- + handle : ctypes.c_void + Object handle. 
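+
+    Examples
+    --------
+    A sketch using the ``StackBatchify`` creator that this module registers
+    from the backend at import time (assuming the C++ backend provides it):
+
+    >>> import mxnet as mx
+    >>> from mxnet.gluon.data._internal import StackBatchify
+    >>> fn = StackBatchify()
+    >>> batch = fn([mx.nd.array([1, 2]), mx.nd.array([3, 4])])  # shape (2, 2)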
+ + """ + def __init__(self, handle, **kwargs): + self._kwargs = kwargs + self.handle = handle + + def __del__(self): + if self.handle is not None: + check_call(_LIB.MXBatchifyFunctionFree(self.handle)) + + def __getstate__(self): + """Override pickling behavior.""" + # pickling pointer is not allowed + d = dict({'creator_name': self._kwargs['creator_name'], + '_kwargs': self._kwargs}) + return d + + def __setstate__(self, d): + """Restore from pickled.""" + creator = d['_kwargs']['creator_name'] + d['_kwargs'].pop('creator_name') + other = getattr(sys.modules[__name__], creator)(**d['_kwargs']) + self.handle = other.handle + self._kwargs = other._kwargs + other.handle = None + + def __call__(self, data, num_out=1): + if isinstance(data[0], NDArray): + create_ndarray_fn = _np_ndarray_cls if is_np_array() else _ndarray_cls + num_output = ctypes.c_int(num_out) + input_arrs = (NDArrayHandle * len(data))() + for i, d in enumerate(data): + input_arrs[i] = d.handle + input_vars = ctypes.cast(input_arrs, ctypes.POINTER(NDArrayHandle)) + batch_size = ctypes.c_int(len(data) // num_output.value) + output_vars = ctypes.POINTER(NDArrayHandle)() + check_call(_LIB.MXBatchifyFunctionInvoke(self.handle, + batch_size, + num_output, + input_vars, + ctypes.byref(output_vars))) + out = [create_ndarray_fn(ctypes.cast(output_vars[i], NDArrayHandle), \ + False) for i in range(num_output.value)] + if len(out) == 1: + out = out[0] + return out + elif isinstance(data[0], (list, tuple)): + return self.__call__([j for sub in data for j in sub], num_out=len(data[0])) + else: + data = [default_array(i) for i in data] + return self.__call__(data, num_out=num_out) + +def _make_internal_datasets(handle): + """Create an io iterator by handle.""" + name = ctypes.c_char_p() + desc = ctypes.c_char_p() + num_args = mx_uint() + arg_names = ctypes.POINTER(ctypes.c_char_p)() + arg_types = ctypes.POINTER(ctypes.c_char_p)() + arg_descs = ctypes.POINTER(ctypes.c_char_p)() + + check_call(_LIB.MXDatasetGetDatasetInfo( \ + handle, ctypes.byref(name), ctypes.byref(desc), \ + ctypes.byref(num_args), \ + ctypes.byref(arg_names), \ + ctypes.byref(arg_types), \ + ctypes.byref(arg_descs))) + iter_name = py_str(name.value) + + narg = int(num_args.value) + param_str = _build_param_doc( + [py_str(arg_names[i]) for i in range(narg)], + [py_str(arg_types[i]) for i in range(narg)], + [py_str(arg_descs[i]) for i in range(narg)]) + + doc_str = ('%s\n\n' + + '%s\n' + + 'Returns\n' + + '-------\n' + + 'MXDataset\n'+ + ' The result dataset.') + doc_str = doc_str % (desc.value, param_str) + + def creator(*args, **kwargs): + """Create a dataset. + The parameters listed below can be passed in as keyword arguments. + + Parameters + ---------- + name : string, required. + Name of the resulting dataset. + + Returns + ------- + dataset: Dataset + The resulting dataset. 
+ """ + param_keys = [] + param_vals = [] + + for k, val in kwargs.items(): + # convert ndarray to handle + if hasattr(val, 'handle'): + val = val.handle.value + if isinstance(val, (tuple, list)): + val = [vv.handle.value if hasattr(vv, 'handle') else vv for vv in val] + param_keys.append(k) + param_vals.append(str(val)) + # create atomic symbol + param_keys = c_str_array(param_keys) + param_vals = c_str_array(param_vals) + dataset_handle = DatasetHandle() + check_call(_LIB.MXDatasetCreateDataset( + handle, + mx_uint(len(param_keys)), + param_keys, param_vals, + ctypes.byref(dataset_handle))) + + if len(args): + raise TypeError('%s can only accept keyword arguments' % iter_name) + + return MXDataset(dataset_handle, **kwargs) + + creator.__name__ = iter_name + creator.__doc__ = doc_str + return creator + +def _init_internal_dataset_module(): + """List and add all the datasets to current module.""" + plist = ctypes.POINTER(ctypes.c_void_p)() + size = ctypes.c_uint() + check_call(_LIB.MXListDatasets(ctypes.byref(size), ctypes.byref(plist))) + module_obj = sys.modules[__name__] + for i in range(size.value): + hdl = ctypes.c_void_p(plist[i]) + dataset = _make_internal_datasets(hdl) + setattr(module_obj, dataset.__name__, dataset) + +_init_internal_dataset_module() + +def _make_internal_batchify_functions(handle): + """Create an io iterator by handle.""" + name = ctypes.c_char_p() + desc = ctypes.c_char_p() + num_args = mx_uint() + arg_names = ctypes.POINTER(ctypes.c_char_p)() + arg_types = ctypes.POINTER(ctypes.c_char_p)() + arg_descs = ctypes.POINTER(ctypes.c_char_p)() + + check_call(_LIB.MXBatchifyFunctionGetFunctionInfo( \ + handle, ctypes.byref(name), ctypes.byref(desc), \ + ctypes.byref(num_args), \ + ctypes.byref(arg_names), \ + ctypes.byref(arg_types), \ + ctypes.byref(arg_descs))) + bf_name = py_str(name.value) + + narg = int(num_args.value) + param_str = _build_param_doc( + [py_str(arg_names[i]) for i in range(narg)], + [py_str(arg_types[i]) for i in range(narg)], + [py_str(arg_descs[i]) for i in range(narg)]) + + doc_str = ('%s\n\n' + + '%s\n' + + 'Returns\n' + + '-------\n' + + 'MXBatchifyFunction\n'+ + ' The result batchify function.') + doc_str = doc_str % (desc.value, param_str) + + def creator(*args, **kwargs): + """Create an iterator. + The parameters listed below can be passed in as keyword arguments. + + Parameters + ---------- + name : string, required. + Name of the resulting batchify function. + + Returns + ------- + batchify_func: BatchifyFunction + The resulting batchify function. 
+ """ + param_keys = [] + param_vals = [] + + for k, val in kwargs.items(): + # convert ndarray to handle + if hasattr(val, 'handle'): + val = val.handle.value + if isinstance(val, (tuple, list)): + val = [vv.handle.value if hasattr(vv, 'handle') else vv for vv in val] + param_keys.append(k) + param_vals.append(str(val)) + # create atomic symbol + param_keys = c_str_array(param_keys) + param_vals = c_str_array(param_vals) + batchify_fn_handle = BatchifyFunctionhandle() + check_call(_LIB.MXBatchifyFunctionCreateFunction( + handle, + mx_uint(len(param_keys)), + param_keys, param_vals, + ctypes.byref(batchify_fn_handle))) + + if len(args): + raise TypeError('%s can only accept keyword arguments' % bf_name) + + return MXBatchifyFunction(batchify_fn_handle, creator_name=bf_name, **kwargs) + + creator.__name__ = bf_name + creator.__doc__ = doc_str + return creator + +def _init_internal_batchify_function_module(): + """List and add all the batchify_functions to current module.""" + plist = ctypes.POINTER(ctypes.c_void_p)() + size = ctypes.c_uint() + check_call(_LIB.MXListBatchifyFunctions(ctypes.byref(size), ctypes.byref(plist))) + module_obj = sys.modules[__name__] + for i in range(size.value): + hdl = ctypes.c_void_p(plist[i]) + bf = _make_internal_batchify_functions(hdl) + setattr(module_obj, bf.__name__, bf) + +_init_internal_batchify_function_module() diff --git a/python/mxnet/gluon/data/batchify.py b/python/mxnet/gluon/data/batchify.py new file mode 100644 index 000000000000..7140a5962881 --- /dev/null +++ b/python/mxnet/gluon/data/batchify.py @@ -0,0 +1,415 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable=reimported, consider-using-enumerate +"""Batchify function.""" +import math +import warnings +import numpy as np + +from ...context import Context, cpu +from ... import ndarray as nd +from ... import numpy as _np +from ...util import is_np_array + +class Stack(object): + r"""Stack the input data samples to construct the batch. + The N input samples must have the same shape/length and will be stacked to construct a batch. + Examples + -------- + >>> from mxnet.gluon.data import batchify + >>> # Stack multiple lists + >>> a = [1, 2, 3, 4] + >>> b = [4, 5, 6, 8] + >>> c = [8, 9, 1, 2] + >>> batchify.Stack()([a, b, c]) + [[1. 2. 3. 4.] + [4. 5. 6. 8.] + [8. 9. 1. 2.]] + + >>> # Stack multiple numpy.ndarrays + >>> import numpy as np + >>> a = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]) + >>> b = np.array([[5, 6, 7, 8], [1, 2, 3, 4]]) + >>> batchify.Stack()([a, b]) + [[[1. 2. 3. 4.] + [5. 6. 7. 8.]] + [[5. 6. 7. 8.] + [1. 2. 3. 4.]]] + + >>> # Stack multiple NDArrays + >>> import mxnet as mx + >>> a = nd.array([[1, 2, 3, 4], [5, 6, 7, 8]]) + >>> b = nd.array([[5, 6, 7, 8], [1, 2, 3, 4]]) + >>> batchify.Stack()([a, b]) + [[[1. 
2. 3. 4.] + [5. 6. 7. 8.]] + [[5. 6. 7. 8.] + [1. 2. 3. 4.]]] + + """ + def __init__(self, use_shared_mem=False): + self._use_shared_mem = use_shared_mem + + def __call__(self, data): + """Batchify the input data + Parameters + ---------- + data : list + The input data samples + Returns + ------- + batch_data : NDArray + """ + _arr = _np if is_np_array() else nd + _arr_cls = _arr.ndarray if is_np_array() else _arr.NDArray + if isinstance(data[0], _arr_cls): + dtype = data[0].dtype + if self._use_shared_mem: + out = _arr.empty((len(data),) + data[0].shape, dtype=dtype, + ctx=Context('cpu_shared', 0)) + return _arr.stack(data, out=out) if is_np_array() else _arr.stack(*data, out=out) + else: + return _arr.stack(data) if is_np_array() else _arr.stack(*data) + elif isinstance(data[0], (tuple, list)): + data = zip(*data) + return [self.__call__(i) for i in data] + else: + out = np.asarray(data) + dtype = out.dtype + if self._use_shared_mem: + return _arr.array(out, ctx=Context('cpu_shared', 0), dtype=dtype) + else: + return _arr.array(out, dtype=dtype) + + def __mx_handle__(self): + from ._internal import StackBatchify + return StackBatchify() + +def _pad_arrs_to_max_length(arrs, pad_val, use_shared_mem, dtype, round_to=None): + """Inner Implementation of the Pad batchify + Parameters + ---------- + arrs : list + pad_val : number + use_shared_mem : bool, default False + round_to : int + + Returns + ------- + ret : NDArray + """ + _arr = _np if is_np_array() else nd + _arr_cls = _np.ndarray if is_np_array() else nd.NDArray + if isinstance(arrs[0], _arr_cls): + dtype = arrs[0].dtype if dtype is None else dtype + arrs = [arr.asnumpy() for arr in arrs] + elif not isinstance(arrs[0], np.ndarray): + arrs = [np.asarray(ele) for ele in arrs] + dtype = arrs[0][0].dtype if dtype is None else dtype + else: + dtype = arrs[0].dtype if dtype is None else dtype + + ret_shape = list(arrs[0].shape) + for pad_axis in range(len(ret_shape)): + curr_lengths = [ele.shape[pad_axis] for ele in arrs] + max_size = max(curr_lengths) + if round_to is not None: + max_size = round_to * math.ceil(max_size / round_to) + ret_shape[pad_axis] = max_size + ret_shape = (len(arrs), ) + tuple(ret_shape) + + ret = np.full(shape=ret_shape, fill_value=pad_val, dtype=dtype) + + for i, arr in enumerate(arrs): + if arr.shape == ret_shape[1:]: + ret[i] = arr + else: + slices = [slice(None) for _ in range(arr.ndim)] + for pad_axis in range(arr.ndim): + slices[pad_axis] = slice(0, arr.shape[pad_axis]) + assert slices[pad_axis].start != slices[pad_axis].stop + slices = [slice(i, i + 1)] + slices + ret[tuple(slices)] = arr + + + ctx = Context('cpu_shared', 0) if use_shared_mem else cpu() + ret = _arr.array(ret, ctx=ctx, dtype=dtype) + + return ret + + +class Pad(object): + """Pad the input ndarrays along the specific padding axis and stack them to get the output. + Input of the function will be N samples. Each sample should contain a single element that + can be 1) numpy.ndarray, 2) mxnet.nd.NDArray, 3) list of numbers. + You can set the `pad_val` to determine the padding value. + + The arrays will be padded to the largest dimensions(at most 5 dimensions to pad) and then + stacked to form the final output. + + Parameters + ---------- + val : float or int, default None + The padding value. + dtype : str or numpy.dtype, default None + The value type of the output. If it is set to None, the input data type is used. + round_to : int, default None + If specified, the padded dimension will be rounded to be multiple of this argument. 
+ + Examples + -------- + >>> from mxnet.gluon.data import batchify + >>> # Inputs are multiple lists + >>> a = [1, 2, 3, 4] + >>> b = [4, 5, 6] + >>> c = [8, 2] + >>> batchify.Pad()([a, b, c]) + [[ 1 2 3 4] + [ 4 5 6 0] + [ 8 2 0 0]] + + >>> # Also output the lengths + >>> a = [1, 2, 3, 4] + >>> b = [4, 5, 6] + >>> c = [8, 2] + >>> # Inputs are multiple ndarrays + >>> import numpy as np + >>> a = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]) + >>> b = np.array([[5, 8], [1, 2]]) + >>> batchify.Pad(val=-1)([a, b]) + [[[ 1 2 3 4] + [ 5 6 7 8]] + [[ 5 8 -1 -1] + [ 1 2 -1 -1]]] + + >>> # Inputs are multiple NDArrays + >>> import mxnet as mx + >>> a = nd.array([[1, 2, 3, 4], [5, 6, 7, 8]]) + >>> b = nd.array([[5, 8], [1, 2]]) + >>> batchify.Pad(val=-1)([a, b]) + [[[ 1. 2. 3. 4.] + [ 5. 6. 7. 8.]] + [[ 5. 8. -1. -1.] + [ 1. 2. -1. -1.]]] + + """ + def __init__(self, val=None, dtype=None, round_to=None, use_shared_mem=False): + self._pad_val = 0 if val is None else val + self._dtype = dtype + self._warned = False + self._round_to = round_to + self._use_shared_mem = use_shared_mem + + def __call__(self, data): + """Batchify the input data. + + The input can be list of numpy.ndarray, list of numbers or list of + mxnet.nd.NDArray. Inputting mxnet.nd.NDArray is discouraged as each + array need to be converted to numpy for efficient padding. + The arrays will be padded to the largest dimension at `axis` and then + stacked to form the final output. + + Parameters + ---------- + data : List[np.ndarray] or List[List[dtype]] or List[nd.NDArray] + List of samples to pad and stack. + Returns + ------- + batch_data: NDArray + Data in the minibatch. Shape is (N, ...) + """ + _arr = _np if is_np_array() else nd + _arr_cls = _arr.ndarray if is_np_array() else _arr.NDArray + if isinstance(data[0], _arr_cls) and not self._warned: + self._warned = True + warnings.warn( + 'Using Pad with NDArrays is discouraged for speed reasons. ' + 'Instead you should pad your data while it is still a list ' + 'and before converting to an NDArray. ' + 'Alternatively you can consider inputting a numpy.ndarray.') + if isinstance(data[0], (_arr_cls, np.ndarray, list)): + padded_arr = _pad_arrs_to_max_length(data, self._pad_val, + self._use_shared_mem, + self._dtype, self._round_to) + return padded_arr + else: + raise NotImplementedError( + "Pad() does not support multiple items, use Group(Pad(), Pad(), ...) instead") + + def __mx_handle__(self): + from ._internal import PadBatchify + return PadBatchify(pad_val=self._pad_val, dtype=self._dtype if self._dtype is not None else -1) + +def _append_arrs(arrs, use_shared_mem=False, expand=False, batch_axis=0): + """Internal impl for returning appened arrays as list.""" + _arr = _np if is_np_array() else nd + if isinstance(arrs[0], _arr.NDArray): + if use_shared_mem: + out = [x.as_in_context(Context('cpu_shared', 0)) for x in arrs] + else: + out = arrs + else: + if use_shared_mem: + out = [_arr.array(x, ctx=Context('cpu_shared', 0)) for x in arrs] + else: + out = [_arr.array(x) for x in arrs] + + # add batch axis + if expand: + out = [x.expand_dims(axis=batch_axis) for x in out] + return out + + +class Append(object): + r"""Loosely return list of the input data samples. + There is no constraint of shape for any of the input samples, however, you will + only be able to apply single batch operations since the output have different shapes. + Examples + -------- + >>> a = [1, 2, 3, 4] + >>> b = [4, 5, 6] + >>> c = [8, 2] + >>> batchify.Append()([a, b, c]) + [ + [[1. 2. 3. 4.]] + , + [[4. 5. 
+    ,
+    [[8. 2.]]
+
+    ]
+    """
+
+    def __init__(self, expand=True, batch_axis=0, use_shared_mem=False):
+        self._expand = expand
+        self._batch_axis = batch_axis
+        self._use_shared_mem = use_shared_mem
+
+    def __call__(self, data):
+        """Batchify the input data.
+        Parameters
+        ----------
+        data : list
+            The input data samples
+        Returns
+        -------
+        batch_data : NDArray
+        """
+        return _append_arrs(data, use_shared_mem=self._use_shared_mem,
+                            expand=self._expand, batch_axis=self._batch_axis)
+
+class Group(object):
+    """Wrap multiple batchify functions together. The input functions will be applied
+    to the corresponding input fields.
+    Each data sample should be a list or tuple containing multiple attributes. The `i`th batchify
+    function stored in `Group` will be applied to the `i`th attribute. For example, each
+    data sample is (nd_data, label). You can wrap two batchify functions using
+    `Group(DataBatchify, LabelBatchify)` to batchify nd_data and label correspondingly.
+    Parameters
+    ----------
+    fn : list or tuple or callable
+        The batchify functions to wrap.
+    *args : tuple of callable
+        The additional batchify functions to wrap.
+    Examples
+    --------
+    >>> a = ([1, 2, 3, 4], 0)
+    >>> b = ([5, 7], 1)
+    >>> c = ([1, 2, 3, 4, 5, 6, 7], 0)
+    >>> f1, f2 = Group(Pad(val=0),
+    ...                Stack())([a, b])
+    >>> f1
+
+    [[1. 2. 3. 4.]
+     [5. 7. 0. 0.]]
+
+    >>> f2
+
+    [0 1]
+
+    """
+    def __init__(self, fn, *args):
+        self._handle = None
+        if isinstance(fn, (list, tuple)):
+            assert len(args) == 0, 'Input pattern not understood. The input of Group can be ' \
+                                   'Group(A, B, C) or Group([A, B, C]) or Group((A, B, C)). ' \
+                                   'Received fn=%s, args=%s' % (str(fn), str(args))
+            self._fn = fn
+        else:
+            self._fn = (fn, ) + args
+        for i, ele_fn in enumerate(self._fn):
+            assert hasattr(ele_fn, '__call__'), 'Batchify functions must be callable! ' \
+                                                'type(fn[%d]) = %s' % (i, str(type(ele_fn)))
+
+    def __call__(self, data):
+        """Batchify the input data.
+        Parameters
+        ----------
+        data : list
+            The samples to batchify. Each sample should contain N attributes.
+        Returns
+        -------
+        ret : tuple
+            A tuple of length N. Contains the batchified result of each attribute in the input.
+        """
+        assert len(data[0]) == len(self._fn),\
+            'Each data sample should contain {} attributes, one per batchify ' \
+            'function.'.format(len(self._fn))
+        ret = []
+        for i, ele_fn in enumerate(self._fn):
+            ret.append(ele_fn([ele[i] for ele in data]))
+        return tuple(ret)
+
+    def __mx_handle__(self):
+        if self._handle is None:
+            from ._internal import GroupBatchify
+            try:
+                mx_fn = [fn.__mx_handle__() for fn in self._fn]
+                self._handle = GroupBatchify(functions=mx_fn)
+            except Exception as e:
+                raise NotImplementedError(
+                    "GroupBatchify requires all internal batchify functions to be "
+                    "supported by the backend: " + str(e))
+        return self._handle
+
+class AsList(object):
+    """Simply forward the list of input data.
+    This is particularly useful when the Dataset contains textual data,
+    in conjunction with the `Group` batchify function.
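+    Non-tensor fields such as strings cannot be stacked or padded into tensors,
+    so they are forwarded as a plain Python list instead.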
+    Examples
+    --------
+    >>> a = ([1, 2, 3, 4], "I am using MXNet")
+    >>> b = ([5, 7, 2, 5], "Gluon rocks!")
+    >>> c = ([1, 2, 3, 4], "Batchification!")
+    >>> _, l = Group(Stack(), AsList())([a, b, c])
+    >>> l
+    ['I am using MXNet', 'Gluon rocks!', 'Batchification!']
+    """
+    def __call__(self, data):
+        """
+        Parameters
+        ----------
+        data : list
+            The list of samples
+        Returns
+        -------
+        ret : list
+            The input list
+        """
+        return list(data)
diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py
index d34148417355..3cbfa0e856c5 100644
--- a/python/mxnet/gluon/data/dataloader.py
+++ b/python/mxnet/gluon/data/dataloader.py
@@ -21,6 +21,8 @@
 __all__ = ['DataLoader']
 
 import pickle
+import logging
+import warnings
 import io
 import sys
 import signal
@@ -37,6 +39,7 @@
     pass
 
 from . import sampler as _sampler
+from . import batchify as _batchify
 from ... import nd, context
 from ...util import is_np_shape, is_np_array, set_np
 from ... import numpy as _mx_np  # pylint: disable=reimported
@@ -139,7 +142,6 @@ def __init__(self, *args, **kwargs):
         self._send = self._writer.send
         self._recv = self._reader.recv
 
-
 def default_batchify_fn(data):
     """Collate data into batch."""
     if isinstance(data[0], nd.NDArray):
@@ -380,9 +382,9 @@ def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None,
         self._num_workers = num_workers if num_workers >= 0 else 0
         if batchify_fn is None:
             if num_workers > 0:
-                self._batchify_fn = default_mp_batchify_fn
+                self._batchify_fn = _batchify.Stack(use_shared_mem=True)
             else:
-                self._batchify_fn = default_batchify_fn
+                self._batchify_fn = _batchify.Stack()
         else:
             self._batchify_fn = batchify_fn
 
@@ -535,17 +537,7 @@ class DataLoader(object):
         shuffle, sampler, and last_batch if batch_sampler is specified.
     batchify_fn : callable
         Callback function to allow users to specify how to merge samples
-        into a batch. Defaults to `default_batchify_fn`::
-
-            def default_batchify_fn(data):
-                if isinstance(data[0], nd.NDArray):
-                    return nd.stack(*data)
-                elif isinstance(data[0], tuple):
-                    data = zip(*data)
-                    return [default_batchify_fn(i) for i in data]
-                else:
-                    data = np.asarray(data)
-                    return nd.array(data, dtype=data.dtype)
+        into a batch. Defaults to `gluon.data.batchify.Stack()`.
 
     num_workers : int, default 0
         The number of multiprocessing workers to use for data preprocessing.
@@ -572,16 +564,26 @@ def default_batchify_fn(data):
         unless you are experiencing timeout and you know it's due to slow data loading.
         Sometimes full `shared_memory` will cause all workers to hang and causes timeout. In these
         cases please reduce `num_workers` or increase system `shared_memory` size instead.
+    try_nopython : bool, default is None
+        Whether to try to compile the Python data loading pipeline into a pure
+        MXNet C++ implementation. The benefits are potentially faster iteration,
+        no `shared_memory` usage, and fewer processes managed by Python. The
+        compilation is not guaranteed to support all use cases, but it will fall
+        back to Python in case of failure. Set `try_nopython` to `False` to
+        disable auto-detection of the compilation feature, or leave it as `None`
+        to let MXNet determine it automatically. If you set `try_nopython` to
+        `True` and the compilation fails, a warning is raised and the Python
+        implementation is used instead.
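+
+    Examples
+    --------
+    A minimal sketch of the no-python path; whether compilation succeeds depends
+    on the dataset, sampler, and batchify function, as described above:
+
+    >>> dataset = mx.gluon.data.ArrayDataset(mx.nd.arange(100).reshape((50, 2)))
+    >>> loader = mx.gluon.data.DataLoader(dataset, batch_size=10, num_workers=2,
+    ...                                   try_nopython=True)  # warns and falls back on failure
+    >>> for batch in loader:
+    ...     pass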
+ """ def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None, last_batch=None, batch_sampler=None, batchify_fn=None, num_workers=0, pin_memory=False, pin_device_id=0, - prefetch=None, thread_pool=False, timeout=120): + prefetch=None, thread_pool=False, timeout=120, try_nopython=None): self._dataset = dataset self._pin_memory = pin_memory self._pin_device_id = pin_device_id self._thread_pool = thread_pool self._timeout = timeout + self._mx_iter = None assert timeout > 0, "timeout must be positive, given {}".format(timeout) if batch_sampler is None: @@ -607,28 +609,51 @@ def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None, self._num_workers = num_workers if num_workers >= 0 else 0 self._worker_pool = None self._prefetch = max(0, int(prefetch) if prefetch is not None else 2 * self._num_workers) - if self._num_workers > 0: - if self._thread_pool: - self._worker_pool = ThreadPool(self._num_workers, - initializer=_thread_worker_initializer, - initargs=(is_np_shape(), is_np_array())) - else: - # set ignore keyboard interupt signal before forking processes - original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN) - self._worker_pool = multiprocessing.Pool( - self._num_workers, initializer=_worker_initializer, - initargs=[self._dataset, is_np_shape(), is_np_array()]) - # resume keyboard interupt signal in main process - signal.signal(signal.SIGINT, original_sigint_handler) if batchify_fn is None: if num_workers > 0: - self._batchify_fn = default_mp_batchify_fn + self._batchify_fn = _batchify.Stack(use_shared_mem=True) else: - self._batchify_fn = default_batchify_fn + self._batchify_fn = _batchify.Stack() else: self._batchify_fn = batchify_fn + if num_workers > 0 and (try_nopython or try_nopython is None): + # check for capability to use mx backend threadedLoader + use_mx_iter, mx_iter_args = _check_mx_loader_capability( + self._dataset, self._batch_sampler, self._batchify_fn) + if not use_mx_iter: + if try_nopython: + warnings.warn(mx_iter_args) + else: + use_mx_iter = False + + if use_mx_iter: + logging.info("Using MXNet backend ThreadedDataLoader with %s workers " + "instead of python dataloader.", self._num_workers) + self._mx_iter = MXThreadedDataLoader( + num_workers=self._num_workers, + pin_memory=self._pin_memory, + pin_device_id=self._pin_device_id, + prefetch=self._prefetch, **mx_iter_args) + else: + if self._num_workers > 0: + if self._thread_pool: + self._worker_pool = ThreadPool(self._num_workers, + initializer=_thread_worker_initializer, + initargs=(is_np_shape(), is_np_array())) + else: + # set ignore keyboard interupt signal before forking processes + original_sigint_handler = signal.signal(signal.SIGINT, signal.SIG_IGN) + self._worker_pool = multiprocessing.Pool( + self._num_workers, initializer=_worker_initializer, + initargs=[self._dataset, is_np_shape(), is_np_array()]) + # resume keyboard interupt signal in main process + signal.signal(signal.SIGINT, original_sigint_handler) + def __iter__(self): + if self._mx_iter is not None: + return iter(self._mx_iter) + if self._num_workers == 0: def same_process_iter(): for batch in self._batch_sampler: @@ -655,3 +680,119 @@ def __del__(self): # https://bugs.python.org/issue34172 assert isinstance(self._worker_pool, multiprocessing.pool.Pool) self._worker_pool.terminate() + +def _check_mx_loader_capability(dataset, batch_sampler, batchify_fn): + from ._internal import MXDataset, MXSampler + from ._internal import MXBatchifyFunction + mx_loader_args = {} + error_template = "MXNet backend 
loader compatibility: " \
+                     "[dataset - {}][batchify_fn - {}][batch sampler - {}]"
+
+    # supported dataset
+    if isinstance(dataset, MXDataset):
+        mx_loader_args['dataset'] = dataset
+    elif hasattr(dataset, '__mx_handle__'):
+        try:
+            mx_loader_args['dataset'] = dataset.__mx_handle__()
+        except NotImplementedError:
+            return False, error_template.format('fail', 'unknown', 'unknown')
+    else:
+        return False, error_template.format('fail', 'unknown', 'unknown')
+
+    # supported batchify functions
+    if hasattr(batchify_fn, '__mx_handle__'):
+        mx_loader_args['batchify_fn'] = batchify_fn.__mx_handle__()
+    elif isinstance(batchify_fn, MXBatchifyFunction):
+        mx_loader_args['batchify_fn'] = batchify_fn
+    else:
+        return False, error_template.format('pass', 'fail', 'unknown')
+
+    # supported sampler
+    if isinstance(batch_sampler, _sampler.BatchSampler):
+        if isinstance(batch_sampler._sampler, _sampler.SequentialSampler):
+            mx_loader_args['batch_sampler'] = MXSampler(
+                'SequentialSampler', length=batch_sampler._sampler._length,
+                start=batch_sampler._sampler._start,
+                batch_size=batch_sampler._batch_size,
+                last_batch=batch_sampler._last_batch)
+        elif isinstance(batch_sampler._sampler, _sampler.RandomSampler):
+            mx_loader_args['batch_sampler'] = MXSampler(
+                'RandomSampler', length=batch_sampler._sampler._length,
+                batch_size=batch_sampler._batch_size,
+                last_batch=batch_sampler._last_batch)
+        else:
+            return False, error_template.format('pass', 'pass', 'fail')
+    elif isinstance(batch_sampler, MXSampler):
+        mx_loader_args['batch_sampler'] = batch_sampler
+    else:
+        return False, error_template.format('pass', 'pass', 'fail')
+    # all good
+    return True, mx_loader_args
+
+
+class MXThreadedDataLoader(object):
+    """MXNet's internal C++ threaded data iterator, exposed in the form of a DataLoader.
+
+    Parameters
+    ----------
+    dataset : Dataset
+        Source dataset. Note that numpy and mxnet arrays can be directly used
+        as a Dataset.
+    batch_sampler : Sampler
+        A sampler that returns mini-batches.
+    batchify_fn : callable
+        Callback function to allow users to specify how to merge samples
+        into a batch. Defaults to `gluon.data.batchify.Stack()`.
+    num_workers : int, default 0
+        The number of worker threads to use for data preprocessing.
+    pin_memory : boolean, default False
+        If ``True``, the dataloader will copy NDArrays into pinned memory
+        before returning them. Copying from CPU pinned memory to GPU is faster
+        than from normal CPU memory.
+    pin_device_id : int, default 0
+        The device id to use for allocating pinned memory if pin_memory is ``True``
+    prefetch : int, default is `num_workers * 2`
+        The number of prefetched batches; only effective if `num_workers` > 0.
+        If `prefetch` > 0, worker threads fetch batches ahead of the consumer.
+        A larger prefetch gives a smoother start-up at the cost of more memory,
+        while a value that is too small may forfeit the benefit of multiple
+        workers; in that case, try reducing `num_workers`. The maximum prefetch
+        size is `16`.
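+
+    Note that this class is normally constructed by `DataLoader` itself when the
+    `try_nopython` capability check succeeds; constructing it directly requires
+    backend (`_internal`) dataset, sampler, and batchify handles.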
+ """ + def __init__(self, dataset, batch_sampler, batchify_fn, + num_workers=0, pin_memory=False, pin_device_id=0, + prefetch=4): + from ._internal import MXDataset, MXSampler, MXBatchifyFunction + from ...io.io import ThreadedDataLoader + assert isinstance(dataset, MXDataset) + assert isinstance(batch_sampler, MXSampler) + assert isinstance(batchify_fn, MXBatchifyFunction) + self._dataset = dataset + self._batch_sampler = batch_sampler + self._batchify_fn = batchify_fn + if num_workers == 0: + num_workers = 1 # different convention for single thread + if prefetch == 0: + prefetch = 1 # at least one buffer required + pin_device_id = pin_device_id if pin_memory else -1 + ctx = 'cpu_pinned' if pin_memory else 'cpu' + self._iter = ThreadedDataLoader(num_workers=num_workers, dataset=dataset, + sampler=batch_sampler, batchify_fn=batchify_fn, + prefetch_buffer=prefetch, ctx=ctx, + device_id=pin_device_id) + + def __iter__(self): + while self._iter.iter_next(): + self._iter.first_batch = None + items = self._iter.getitems() + pad = self._iter.getpad() + if pad > 0: + items = tuple([x[:-pad] for x in items]) + if len(items) < 2: + items = items[0] + yield items + self._iter.reset() + + def __len__(self): + return len(self._iter) diff --git a/python/mxnet/gluon/data/dataset.py b/python/mxnet/gluon/data/dataset.py index 7f2c6342d595..c70e792ef017 100644 --- a/python/mxnet/gluon/data/dataset.py +++ b/python/mxnet/gluon/data/dataset.py @@ -24,6 +24,7 @@ import os from ... import recordio, ndarray +from ...util import default_array class Dataset(object): @@ -196,6 +197,7 @@ class SimpleDataset(Dataset): """ def __init__(self, data): self._data = data + self._handle = None def __len__(self): return len(self._data) @@ -203,12 +205,25 @@ def __len__(self): def __getitem__(self, idx): return self._data[idx] + def __mx_handle__(self): + if self._handle is None: + import numpy as np + from ._internal import NDArrayDataset + if isinstance(self._data, (np.ndarray, ndarray.NDArray)): + self._handle = NDArrayDataset(arr=default_array(self._data)) + else: + raise NotImplementedError( + "C++ handle for general type object is not supported, " + "given {}, expect np.ndarray".format(type(self._data))) + return self._handle + class _LazyTransformDataset(Dataset): """Lazily transformed dataset.""" def __init__(self, data, fn): self._data = data self._fn = fn + self.handle = None def __len__(self): return len(self._data) @@ -219,6 +234,43 @@ def __getitem__(self, idx): return self._fn(*item) return self._fn(item) + def __mx_handle__(self): + if self.handle is None: + from ..block import HybridBlock + from ._internal import LazyTransformDataset + from ...base import numeric_types + if not hasattr(self._data, '__mx_handle__'): + raise NotImplementedError("{} don't support backend".format(self._data)) + if isinstance(self._fn, HybridBlock): + item = self._data[0] + self._fn.hybridize() + if isinstance(item, tuple): + ret = self._fn(*item) + is_scalar = [int(isinstance(x, numeric_types)) for x in ret] + else: + ret = self._fn(item) + is_scalar = [int(isinstance(ret, numeric_types))] + cached_op = self._fn._cached_op + self.handle = LazyTransformDataset(cached_op=cached_op, + dataset=self._data.__mx_handle__(), + scalar_outputs=tuple(is_scalar)) + elif isinstance(self._fn, _TransformFirstClosure): + if not isinstance(self._fn._fn, HybridBlock): + raise NotImplementedError("Block not supported.") + item = self._data[0][0] + self._fn._fn.hybridize() + ret = self._fn._fn(item) + is_scalar = [int(isinstance(ret, 
numeric_types))] + cached_op = self._fn._fn._cached_op + self.handle = LazyTransformDataset(cached_op=cached_op, + dataset=self._data.__mx_handle__(), + scalar_outputs=tuple(is_scalar), + transform_indices=(0,)) + else: + raise NotImplementedError( + "C++ handle Not implemented for transforms that are not hybridizable") + return self.handle + class _TransformFirstClosure(object): """Use callable object instead of nested function, it can be pickled.""" @@ -235,6 +287,7 @@ class _FilteredDataset(Dataset): def __init__(self, dataset, fn): self._dataset = dataset self._indices = [i for i, sample in enumerate(dataset) if fn(sample)] + self.handle = None def __len__(self): return len(self._indices) @@ -242,12 +295,27 @@ def __len__(self): def __getitem__(self, idx): return self._dataset[self._indices[idx]] + def __mx_handle__(self): + if self.handle is None: + from ._internal import MXDataset, IndexedDataset + if hasattr(self._dataset, '__mx_handle__'): + dataset = self._dataset.__mx_handle__() + elif isinstance(self._dataset, MXDataset): + dataset = self._dataset + else: + raise NotImplementedError('{} not supported.'.format(self._dataset)) + self.handle = IndexedDataset(base=dataset, + indices=self._indices) + return self.handle + + class _SampledDataset(Dataset): """Dataset with elements chosen by a sampler""" def __init__(self, dataset, sampler): self._dataset = dataset self._sampler = sampler self._indices = list(iter(sampler)) + self.handle = None def __len__(self): return len(self._sampler) @@ -255,6 +323,20 @@ def __len__(self): def __getitem__(self, idx): return self._dataset[self._indices[idx]] + def __mx_handle__(self): + if self.handle is None: + from ._internal import MXDataset, IndexedDataset + if hasattr(self._dataset, '__mx_handle__'): + dataset = self._dataset.__mx_handle__() + elif isinstance(self._dataset, MXDataset): + dataset = self._dataset + else: + raise NotImplementedError('{} not supported.'.format(self._dataset)) + self.handle = IndexedDataset(base=dataset, + indices=self._indices) + return self.handle + + class ArrayDataset(Dataset): """A dataset that combines multiple dataset-like objects, e.g. Datasets, lists, arrays, etc. @@ -277,6 +359,7 @@ def __init__(self, *args): if isinstance(data, ndarray.NDArray) and len(data.shape) == 1: data = data.asnumpy() self._data.append(data) + self.handle = None def __getitem__(self, idx): if len(self._data) == 1: @@ -287,6 +370,20 @@ def __getitem__(self, idx): def __len__(self): return self._length + def __mx_handle__(self): + if self.handle is None: + from ._internal import MXDataset, NDArrayDataset, GroupDataset + datasets = [] + for data in self._data: + if isinstance(data, MXDataset): + datasets.append(data) + elif hasattr(data, '__mx_handle__'): + datasets.append(data.__mx_handle__()) + else: + datasets.append(NDArrayDataset(arr=default_array(data))) + self.handle = GroupDataset(datasets=datasets) + return self.handle + class RecordFileDataset(Dataset): """A dataset wrapping over a RecordIO (.rec) file. 
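Each `__mx_handle__` implementation above lowers a Python-side dataset to its C++ counterpart so the threaded backend loader can consume it without crossing back into Python. A minimal sketch of the lowering, assuming this patch is applied (`SimpleDataset` over a NumPy array is the simplest case with a backend equivalent):

    import numpy as np
    from mxnet.gluon.data import SimpleDataset

    ds = SimpleDataset(np.arange(12, dtype='float32').reshape(6, 2))
    handle = ds.__mx_handle__()   # backend dataset wrapping the same buffer
    print(len(ds), type(handle))  # 6 samples, an _internal.NDArrayDataset

A dataset composed from lowerable pieces (e.g. `ArrayDataset`, or `transform` with a `HybridBlock`) stays lowerable, which is exactly what `_check_mx_loader_capability` probes for.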
@@ -309,6 +406,10 @@ def __getitem__(self, idx): def __len__(self): return len(self._record.keys) + def __mx_handle__(self): + from ._internal import RecordFileDataset as _RecordFileDataset + return _RecordFileDataset(rec_file=self.filename, idx_file=self.idx_file) + class _DownloadedDataset(Dataset): """Base class for MNIST, cifar10, etc.""" @@ -322,6 +423,7 @@ def __init__(self, root, transform): if not os.path.isdir(root): os.makedirs(root) self._get_data() + self.handle = None def __getitem__(self, idx): if self._transform is not None: @@ -333,3 +435,11 @@ def __len__(self): def _get_data(self): raise NotImplementedError + + def __mx_handle__(self): + if self.handle is None: + from ._internal import NDArrayDataset, GroupDataset + self.handle = GroupDataset( + datasets=(NDArrayDataset(arr=default_array(self._data)), + NDArrayDataset(arr=default_array(self._label)))) + return self.handle diff --git a/python/mxnet/gluon/data/vision/datasets.py b/python/mxnet/gluon/data/vision/datasets.py index bdcaff52a042..9912a139ffa7 100644 --- a/python/mxnet/gluon/data/vision/datasets.py +++ b/python/mxnet/gluon/data/vision/datasets.py @@ -19,7 +19,7 @@ # pylint: disable= """Dataset container.""" __all__ = ['MNIST', 'FashionMNIST', 'CIFAR10', 'CIFAR100', - 'ImageRecordDataset', 'ImageFolderDataset'] + 'ImageRecordDataset', 'ImageFolderDataset', 'ImageListDataset'] import os import gzip @@ -32,7 +32,8 @@ from ...utils import download, check_sha1, _get_repo_file_url from .... import nd, image, recordio, base from .... import numpy as _mx_np # pylint: disable=reimported -from ....util import is_np_array +from ....util import is_np_array, default_array +from ....base import numeric_types class MNIST(dataset._DownloadedDataset): @@ -260,6 +261,11 @@ def __getitem__(self, idx): return self._transform(image.imdecode(img, self._flag), header.label) return image.imdecode(img, self._flag), header.label + def __mx_handle__(self): + from .._internal import ImageRecordFileDataset as _ImageRecordFileDataset + return _ImageRecordFileDataset(rec_file=self.filename, idx_file=self.idx_file, + flag=self._flag) + class ImageFolderDataset(dataset.Dataset): """A dataset for loading image files stored in a folder structure. @@ -298,6 +304,7 @@ def __init__(self, root, flag=1, transform=None): self._transform = transform self._exts = ['.jpg', '.jpeg', '.png'] self._list_images(self._root) + self._handle = None def _list_images(self, root): self.synsets = [] @@ -328,3 +335,107 @@ def __getitem__(self, idx): def __len__(self): return len(self.items) + + def __mx_handle__(self): + if self._handle is None: + from .._internal import ImageSequenceDataset, NDArrayDataset, GroupDataset + path_sep = '|' + im_names = path_sep.join([x[0] for x in self.items]) + label = default_array([x[1] for x in self.items]) + self._handle = GroupDataset(datasets=( + ImageSequenceDataset(img_list=im_names, path_sep=path_sep, flag=self._flag), + NDArrayDataset(arr=label))) + return self._handle + + +class ImageListDataset(dataset.Dataset): + """A dataset for loading image files specified by a list of entries. 
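+    The entries can come from a tab-separated ``.lst`` text file with
+    (index, label(s), path) columns, or from an in-memory list,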
+ + like:: + + # if written to text file *.lst + 0\t0\troot/car/0001.jpg + 1\t0\troot/car/xxxa.jpg + 2\t0\troot/car/yyyb.jpg + 3\t1\troot/bus/123.jpg + 4\t1\troot/bus/023.jpg + 5\t1\troot/bus/wwww.jpg + + # if as a pure list, each item is a list [imagelabel: float or list of float, imgpath] + [[0, root/car/0001.jpg] + [0, root/car/xxxa.jpg] + [0, root/car/yyyb.jpg] + [1, root/bus/123.jpg] + [1, root/bus/023.jpg] + [1, root/bus/wwww.jpg]] + + Parameters + ---------- + root : str + Path to root directory. + imglist : str or list + Specify the path of imglist file or a list directly + flag : {0, 1}, default 1 + If 0, always convert loaded images to greyscale (1 channel). + If 1, always convert loaded images to colored (3 channels). + + Attributes + ---------- + items : list of tuples + List of all images in (filename, label) pairs. + """ + def __init__(self, root='.', imglist=None, flag=1): + self._root = os.path.expanduser(root) + self._flag = flag + self._imglist = {} + self._imgkeys = [] + self._handle = None + array_fn = _mx_np.array if is_np_array() else nd.array + if isinstance(imglist, str): + # read from file + fname = os.path.join(self._root, imglist) + with open(fname, 'rt') as fin: + for line in iter(fin.readline, ''): + line = line.strip().split('\t') + label = array_fn(line[1:-1]) + key = int(line[0]) + self._imglist[key] = (label, os.path.join(self._root, line[-1])) + self._imgkeys.append(key) + elif isinstance(imglist, list): + index = 1 + for img in imglist: + key = str(index) + index += 1 + if len(img) > 2: + label = array_fn(img[:-1]) + elif isinstance(img[0], numeric_types): + label = array_fn([img[0]]) + else: + label = array_fn(img[0]) + assert isinstance(img[-1], str) + self._imglist[key] = (label, os.path.join(self._root, img[-1])) + self._imgkeys.append(key) + else: + raise ValueError( + "imglist must be filename or list of valid entries, given {}".format( + type(imglist))) + + def __getitem__(self, idx): + key = self._imgkeys[idx] + img = image.imread(self._imglist[key][1], self._flag) + label = self._imglist[key][0] + return img, label + + def __len__(self): + return len(self._imgkeys) + + def __mx_handle__(self): + if self._handle is None: + from .._internal import ImageSequenceDataset, NDArrayDataset, GroupDataset + path_sep = '|' + im_names = path_sep.join([self._imglist[x][1] for x in self._imgkeys]) + label = default_array(np.array([self._imglist[x][0].asnumpy() for x in self._imgkeys])) + self._handle = GroupDataset(datasets=( + ImageSequenceDataset(img_list=im_names, path_sep=path_sep, flag=self._flag), + NDArrayDataset(arr=label))) + return self._handle diff --git a/python/mxnet/gluon/data/vision/transforms/__init__.py b/python/mxnet/gluon/data/vision/transforms/__init__.py new file mode 100644 index 000000000000..c32bac14c0e4 --- /dev/null +++ b/python/mxnet/gluon/data/vision/transforms/__init__.py @@ -0,0 +1,197 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# coding: utf-8 +# pylint: disable= arguments-differ, wildcard-import +"Vision transforms." + +import warnings +import random + +from ....block import Block, HybridBlock +from ....nn import Sequential, HybridSequential +from .....util import is_np_array + +from . image import * +from .image import _append_return + + +class Compose(Sequential): + """Sequentially composes multiple transforms. + + Parameters + ---------- + transforms : list of transform Blocks. + The list of transforms to be composed. + + + Inputs: + - **data**: input tensor with shape of the first transform Block requires. + + Outputs: + - **out**: output tensor with shape of the last transform Block produces. + + Examples + -------- + >>> transformer = transforms.Compose([transforms.Resize(300), + ... transforms.CenterCrop(256), + ... transforms.ToTensor()]) + >>> image = mx.nd.random.uniform(0, 255, (224, 224, 3)).astype(dtype=np.uint8) + >>> transformer(image) + + """ + def __init__(self, transforms): + super(Compose, self).__init__() + transforms.append(None) + hybrid = [] + for i in transforms: + if isinstance(i, HybridBlock): + hybrid.append(i) + continue + elif len(hybrid) == 1: + self.add(hybrid[0]) + hybrid = [] + elif len(hybrid) > 1: + hblock = HybridSequential() + for j in hybrid: + hblock.add(j) + hblock.hybridize() + self.add(hblock) + hybrid = [] + + if i is not None: + self.add(i) + + +class HybridCompose(HybridSequential): + """Sequentially composes multiple transforms. This is the Hybrid version of Compose. + + Parameters + ---------- + transforms : list of transform Blocks. + The list of transforms to be composed. + + + Inputs: + - **data**: input tensor with shape of the first transform Block requires. + + Outputs: + - **out**: output tensor with shape of the last transform Block produces. + + Examples + -------- + >>> transformer = transforms.HybridCompose([transforms.Resize(300), + ... transforms.CenterCrop(256), + ... transforms.ToTensor()]) + >>> image = mx.nd.random.uniform(0, 255, (224, 224, 3)).astype(dtype=np.uint8) + >>> transformer(image) + + """ + def __init__(self, transforms): + super(HybridCompose, self).__init__() + for i in transforms: + if not isinstance(i, HybridBlock): + raise ValueError("{} is not a HybridBlock, try use `Compose` instead".format(i)) + self.add(i) + self.hybridize() + + +class Cast(HybridBlock): + """Cast inputs to a specific data type + + Parameters + ---------- + dtype : str, default 'float32' + The target data type, in string or `numpy.dtype`. + + + Inputs: + - **data**: input tensor with arbitrary shape and dtype. + + Outputs: + - **out**: output tensor with the same shape as `data` and data type as dtype. + """ + def __init__(self, dtype='float32'): + super(Cast, self).__init__() + self._dtype = dtype + + def hybrid_forward(self, F, *args): + if is_np_array(): + F = F.npx + return tuple([F.cast(x, self._dtype) for x in args]) + + +class RandomApply(Sequential): + """Apply a list of transformations randomly given probability + + Parameters + ---------- + transforms + List of transformations. + p : float + Probability of applying the transformations. 
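+        All transforms in the list are applied together or skipped together,
+        with probability `p`.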
+ + + Inputs: + - **data**: input tensor. + + Outputs: + - **out**: transformed image. + """ + + def __init__(self, transforms, p=0.5): + super(RandomApply, self).__init__() + self.transforms = transforms + self.p = p + + def forward(self, x, *args): + if self.p < random.random(): + return x + x = self.transforms(x) + return _append_return(x, *args) + + +class HybridRandomApply(HybridSequential): + """Apply a list of transformations randomly given probability + + Parameters + ---------- + transforms + List of transformations which must be HybridBlocks. + p : float + Probability of applying the transformations. + + + Inputs: + - **data**: input tensor. + + Outputs: + - **out**: transformed image. + """ + + def __init__(self, transforms, p=0.5): + super(HybridRandomApply, self).__init__() + assert isinstance(transforms, HybridBlock) + self.transforms = transforms + self.p = p + + def hybrid_forward(self, F, x, *args): + if is_np_array(): + cond = self.p < F.random.uniform(low=0, high=1, size=1) + return F.npx.cond(cond, x, self.transforms(x)) + cond = self.p < F.random.uniform(low=0, high=1, shape=1) + return _append_return(F.contrib.cond(cond, x, self.transforms(x)), *args) diff --git a/python/mxnet/gluon/data/vision/transforms.py b/python/mxnet/gluon/data/vision/transforms/image.py similarity index 68% rename from python/mxnet/gluon/data/vision/transforms.py rename to python/mxnet/gluon/data/vision/transforms/image.py index 0fd1c89575d7..37b2a061b0df 100644 --- a/python/mxnet/gluon/data/vision/transforms.py +++ b/python/mxnet/gluon/data/vision/transforms/image.py @@ -18,87 +18,28 @@ # coding: utf-8 # pylint: disable= arguments-differ "Image transforms." - -import random import numpy as np -from ...block import Block, HybridBlock -from ...nn import Sequential, HybridSequential -from .... import image -from ....base import numeric_types -from ....util import is_np_array - - -class Compose(Sequential): - """Sequentially composes multiple transforms. - - Parameters - ---------- - transforms : list of transform Blocks. - The list of transforms to be composed. - - - Inputs: - - **data**: input tensor with shape of the first transform Block requires. +from ....block import Block, HybridBlock +from ..... import image +from .....base import numeric_types +from .....util import is_np_array - Outputs: - - **out**: output tensor with shape of the last transform Block produces. +__all__ = ['ToTensor', 'Normalize', 'Rotate', 'RandomRotation', + 'RandomResizedCrop', 'CropResize', 'CropResize', 'RandomCrop', + 'CenterCrop', 'Resize', 'RandomFlipLeftRight', 'RandomFlipTopBottom', + 'RandomBrightness', 'RandomContrast', 'RandomSaturation', 'RandomHue', + 'RandomColorJitter', 'RandomLighting', 'RandomGray'] - Examples - -------- - >>> transformer = transforms.Compose([transforms.Resize(300), - ... transforms.CenterCrop(256), - ... transforms.ToTensor()]) - >>> image = mx.nd.random.uniform(0, 255, (224, 224, 3)).astype(dtype=np.uint8) - >>> transformer(image) - +def _append_return(*args): + """Append multiple args together. + This allows many transform functions to bypass additional arguments. 
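+    Returns the single argument unwrapped when one argument is given, a tuple
+    when several are given, and ``None`` when called with no arguments.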
""" - def __init__(self, transforms): - super(Compose, self).__init__() - transforms.append(None) - hybrid = [] - for i in transforms: - if isinstance(i, HybridBlock): - hybrid.append(i) - continue - elif len(hybrid) == 1: - self.add(hybrid[0]) - hybrid = [] - elif len(hybrid) > 1: - hblock = HybridSequential() - for j in hybrid: - hblock.add(j) - hblock.hybridize() - self.add(hblock) - hybrid = [] - - if i is not None: - self.add(i) - - -class Cast(HybridBlock): - """Cast input to a specific data type - - Parameters - ---------- - dtype : str, default 'float32' - The target data type, in string or `numpy.dtype`. - - - Inputs: - - **data**: input tensor with arbitrary shape and dtype. - - Outputs: - - **out**: output tensor with the same shape as `data` and data type as dtype. - """ - def __init__(self, dtype='float32'): - super(Cast, self).__init__() - self._dtype = dtype - - def hybrid_forward(self, F, x): - if is_np_array(): - F = F.npx - return F.cast(x, self._dtype) + if args: + if len(args) == 1: + return args[0] + return tuple(args) + return None class ToTensor(HybridBlock): @@ -139,10 +80,10 @@ class ToTensor(HybridBlock): def __init__(self): super(ToTensor, self).__init__() - def hybrid_forward(self, F, x): + def hybrid_forward(self, F, x, *args): if is_np_array(): F = F.npx - return F.image.to_tensor(x) + return _append_return(F.image.to_tensor(x), *args) class Normalize(HybridBlock): @@ -194,10 +135,10 @@ def __init__(self, mean=0.0, std=1.0): self._mean = mean self._std = std - def hybrid_forward(self, F, x): + def hybrid_forward(self, F, x, *args): if is_np_array(): F = F.npx - return F.image.normalize(x, self._mean, self._std) + return _append_return(F.image.normalize(x, self._mean, self._std), *args) class Rotate(Block): @@ -223,11 +164,11 @@ def __init__(self, rotation_degrees, zoom_in=False, zoom_out=False): super(Rotate, self).__init__() self._args = (rotation_degrees, zoom_in, zoom_out) - def forward(self, x): - if x.dtype is not np.float32: + def forward(self, x, *args): + if np.dtype(x.dtype) is not np.dtype(np.float32): raise TypeError("This transformation only supports float32. " - "Consider calling it after ToTensor") - return image.imrotate(x, *self._args) + "Consider calling it after ToTensor, given: {}".format(x.dtype)) + return _append_return(image.imrotate(x, *self._args), *args) class RandomRotation(Block): @@ -262,16 +203,16 @@ def __init__(self, angle_limits, zoom_in=False, zoom_out=False, rotate_with_prob self._args = (angle_limits, zoom_in, zoom_out) self._rotate_with_proba = rotate_with_proba - def forward(self, x): + def forward(self, x, *args): if np.random.random() > self._rotate_with_proba: return x - if x.dtype is not np.float32: + if np.dtype(x.dtype) is not np.dtype(np.float32): raise TypeError("This transformation only supports float32. " "Consider calling it after ToTensor") - return image.random_rotate(x, *self._args) + return _append_return(image.random_rotate(x, *self._args), *args) -class RandomResizedCrop(Block): +class RandomResizedCrop(HybridBlock): """Crop the input image with random scale and aspect ratio. 
Makes a crop of the original image with random size (default: 0.08 @@ -303,10 +244,16 @@ def __init__(self, size, scale=(0.08, 1.0), ratio=(3.0/4.0, 4.0/3.0), super(RandomResizedCrop, self).__init__() if isinstance(size, numeric_types): size = (size, size) - self._args = (size, scale, ratio, interpolation) + if isinstance(scale, numeric_types): + scale = (scale, 1.0) + self._kwargs = {'width': size[0], 'height': size[1], + 'area': scale, 'ratio': ratio, + 'interp': interpolation, 'max_trial': 10} - def forward(self, x): - return image.random_size_crop(x, *self._args)[0] + def hybrid_forward(self, F, x, *args): + if is_np_array(): + F = F.npx + return _append_return(F.image.random_resized_crop(x, **self._kwargs), *args) class CropResize(HybridBlock): @@ -362,13 +309,73 @@ def __init__(self, x, y, width, height, size=None, interpolation=None): self._size = size self._interpolation = interpolation - def hybrid_forward(self, F, x): - out = F.image.crop(x, self._x, self._y, self._width, self._height) + def hybrid_forward(self, F, x, *args): + if is_np_array(): + _image = F.npx.image + else: + _image = F.image + out = _image.crop(x, self._x, self._y, self._width, self._height) if self._size: - out = F.image.resize(out, self._size, False, self._interpolation) - return out + out = _image.resize(out, self._size, False, self._interpolation) + return _append_return(out, *args) + +class RandomCrop(HybridBlock): + """Randomly crop `src` with `size` (width, height). + Padding is optional. + Upsample result if `src` is smaller than `size` + . + Parameters + ---------- + size : int or tuple of (W, H) + Size of the final output. + pad: int or tuple + if int, size of the zero-padding + if tuple, number of values padded to the edges of each axis. + ((before_1, after_1), ... (before_N, after_N)) unique pad widths for each axis. + ((before, after),) yields same before and after pad for each axis. + (pad,) or int is a shortcut for before = after = pad width for all axes. + pad_value : int + The value to use for padded pixels + interpolation : int + Interpolation method for resizing. By default uses bilinear + interpolation. See OpenCV's resize function for available choices. + Inputs: + - **data**: input tensor with (Hi x Wi x C) shape. + Outputs: + - **out**: output tensor with ((H+2*pad) x (W+2*pad) x C) shape. 
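+
+    Examples
+    --------
+    A minimal usage sketch (the output shape follows `size`, not the padded size):
+
+    >>> transformer = transforms.RandomCrop(224, pad=8)
+    >>> image = mx.nd.random.uniform(0, 255, (240, 240, 3)).astype(np.uint8)
+    >>> transformer(image).shape
+    (224, 224, 3)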
+ """ -class CenterCrop(Block): + def __init__(self, size, pad=None, pad_value=0, interpolation=1): + super(RandomCrop, self).__init__() + if isinstance(size, numeric_types): + size = (size, size) + self._args = ((0, 1), (0, 1), size[0], size[1], interpolation) + self._pad_value = pad_value + if isinstance(pad, int): + self.nd_pad = (0, 0, 0, 0, pad, pad, pad, pad, 0, 0) # workaround as 5D + self.np_pad = ((pad, pad), (pad, pad), (0, 0)) + elif pad is not None: + assert len(pad) >= 4 + self.nd_pad = tuple([0] * 4 + list(pad) + [0] * (6 - len(pad))) + self.np_pad = ((pad[0], pad[1]), (pad[2], pad[3]), (0, 0)) + else: + self.nd_pad = pad + self.np_pad = pad + + def hybrid_forward(self, F, x, *args): + if is_np_array(): + if self.np_pad: + x = F.np.pad(x, pad_width=self.np_pad, mode='constant', constant_values=self._pad_value) + return _append_return(F.npx.image.random_crop(x, *self._args), *args) + else: + if self.nd_pad: + x = F.cast(F.expand_dims(F.expand_dims(x, 0), 0), 'float32') + x_pad = F.pad(x, pad_width=self.nd_pad, mode='constant', constant_value=self._pad_value) + x = F.cast(x_pad.squeeze(0).squeeze(0), 'uint8') + return _append_return(F.image.random_crop(x, *self._args), *args) + + +class CenterCrop(HybridBlock): """Crops the image `src` to the given `size` by trimming on all four sides and preserving the center of the image. Upsamples if `src` is smaller than `size`. @@ -399,10 +406,12 @@ def __init__(self, size, interpolation=1): super(CenterCrop, self).__init__() if isinstance(size, numeric_types): size = (size, size) - self._args = (size, interpolation) + self._args = (size[0], size[1], interpolation) - def forward(self, x): - return image.center_crop(x, *self._args)[0] + def hybrid_forward(self, F, x, *args): + if is_np_array(): + F = F.npx + return _append_return(F.image.random_crop(x, (0.5, 0.5), (0.5, 0.5), *self._args), *args) class Resize(HybridBlock): @@ -445,14 +454,14 @@ def __init__(self, size, keep_ratio=False, interpolation=1): self._size = size self._interpolation = interpolation - def hybrid_forward(self, F, x): + def hybrid_forward(self, F, x, *args): if is_np_array(): F = F.npx - return F.image.resize(x, self._size, self._keep, self._interpolation) + return _append_return(F.image.resize(x, self._size, self._keep, self._interpolation), *args) class RandomFlipLeftRight(HybridBlock): """Randomly flip the input image left to right with a probability - of 0.5. + of p(0.5 by default). Inputs: - **data**: input tensor with (H x W x C) shape. @@ -460,18 +469,27 @@ class RandomFlipLeftRight(HybridBlock): Outputs: - **out**: output tensor with same shape as `data`. """ - def __init__(self): + def __init__(self, p=0.5): super(RandomFlipLeftRight, self).__init__() + self.p = p + + def hybrid_forward(self, F, x, *args): + if self.p <= 0: + return _append_return(x, *args) - def hybrid_forward(self, F, x): if is_np_array(): - F = F.npx - return F.image.random_flip_left_right(x) + if self.p >= 1: + return _append_return(F.npx.image.flip_left_right(x), *args) + return _append_return(F.npx.image.random_flip_left_right(x, p=self.p), *args) + else: + if self.p >= 1: + return _append_return(F.image.flip_left_right(x), *args) + return _append_return(F.image.random_flip_left_right(x, p=self.p), *args) class RandomFlipTopBottom(HybridBlock): """Randomly flip the input image top to bottom with a probability - of 0.5. + of p(0.5 by default). Inputs: - **data**: input tensor with (H x W x C) shape. 
@@ -479,13 +497,22 @@ class RandomFlipTopBottom(HybridBlock): Outputs: - **out**: output tensor with same shape as `data`. """ - def __init__(self): + def __init__(self, p=0.5): super(RandomFlipTopBottom, self).__init__() + self.p = p + + def hybrid_forward(self, F, x, *args): + if self.p <= 0: + return _append_return(x, *args) - def hybrid_forward(self, F, x): if is_np_array(): - F = F.npx - return F.image.random_flip_top_bottom(x) + if self.p >= 1: + return _append_return(F.npx.image.flip_top_bottom(x), *args) + return _append_return(F.npx.image.random_flip_top_bottom(x, p=self.p), *args) + else: + if self.p >= 1: + return _append_return(F.image.flip_top_bottom(x), *args) + return _append_return(F.image.random_flip_top_bottom(x, p=self.p), *args) class RandomBrightness(HybridBlock): @@ -509,10 +536,10 @@ def __init__(self, brightness): super(RandomBrightness, self).__init__() self._args = (max(0, 1-brightness), 1+brightness) - def hybrid_forward(self, F, x): + def hybrid_forward(self, F, x, *args): if is_np_array(): F = F.npx - return F.image.random_brightness(x, *self._args) + return _append_return(F.image.random_brightness(x, *self._args), *args) class RandomContrast(HybridBlock): @@ -536,10 +563,10 @@ def __init__(self, contrast): super(RandomContrast, self).__init__() self._args = (max(0, 1-contrast), 1+contrast) - def hybrid_forward(self, F, x): + def hybrid_forward(self, F, x, *args): if is_np_array(): F = F.npx - return F.image.random_contrast(x, *self._args) + return _append_return(F.image.random_contrast(x, *self._args), *args) class RandomSaturation(HybridBlock): @@ -563,10 +590,10 @@ def __init__(self, saturation): super(RandomSaturation, self).__init__() self._args = (max(0, 1-saturation), 1+saturation) - def hybrid_forward(self, F, x): + def hybrid_forward(self, F, x, *args): if is_np_array(): F = F.npx - return F.image.random_saturation(x, *self._args) + return _append_return(F.image.random_saturation(x, *self._args), *args) class RandomHue(HybridBlock): @@ -590,10 +617,10 @@ def __init__(self, hue): super(RandomHue, self).__init__() self._args = (max(0, 1-hue), 1+hue) - def hybrid_forward(self, F, x): + def hybrid_forward(self, F, x, *args): if is_np_array(): F = F.npx - return F.image.random_hue(x, *self._args) + return _append_return(F.image.random_hue(x, *self._args), *args) class RandomColorJitter(HybridBlock): @@ -626,10 +653,10 @@ def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): super(RandomColorJitter, self).__init__() self._args = (brightness, contrast, saturation, hue) - def hybrid_forward(self, F, x): + def hybrid_forward(self, F, x, *args): if is_np_array(): F = F.npx - return F.image.random_color_jitter(x, *self._args) + return _append_return(F.image.random_color_jitter(x, *self._args), *args) class RandomLighting(HybridBlock): @@ -651,37 +678,36 @@ def __init__(self, alpha): super(RandomLighting, self).__init__() self._alpha = alpha - def hybrid_forward(self, F, x): + def hybrid_forward(self, F, x, *args): if is_np_array(): F = F.npx - return F.image.random_lighting(x, self._alpha) + return _append_return(F.image.random_lighting(x, self._alpha), *args) -class RandomApply(Sequential): - """Apply a list of transformations randomly given probability +class RandomGray(HybridBlock): + """Randomly convert to gray image. Parameters ---------- - transforms - List of transformations. p : float - Probability of applying the transformations. - - - Inputs: - - **data**: input tensor. - - Outputs: - - **out**: transformed image. 
+ Probability to convert to grayscale """ - - def __init__(self, transforms, p=0.5): - super(RandomApply, self).__init__() - self.transforms = transforms + def __init__(self, p=0.5): + super(RandomGray, self).__init__() self.p = p - def forward(self, x): - if self.p < random.random(): - return x - x = self.transforms(x) - return x + def hybrid_forward(self, F, x, *args): + if is_np_array(): + mat = F.np.concatenate((F.np.full((3, 1), 0.2989), + F.np.full((3, 1), 0.5870), + F.np.full((3, 1), 0.114)), axis=1) + x = F.npx.cast(x, dtype='float32') + gray = F.np.where(self.p < F.np.random.uniform(), x, F.np.dot(x, mat)) + else: + mat = F.concat(F.full((3, 1), 0.2989), + F.full((3, 1), 0.5870), + F.full((3, 1), 0.114), dim=1) + cond = self.p < F.random.uniform(shape=1) + x = F.cast(x, dtype='float32') + gray = F.contrib.cond(cond, lambda: x, lambda: F.dot(x, mat)) + return _append_return(gray, *args) diff --git a/python/mxnet/gluon/nn/basic_layers.py b/python/mxnet/gluon/nn/basic_layers.py index 797392a6a36a..2011b5bf36b2 100644 --- a/python/mxnet/gluon/nn/basic_layers.py +++ b/python/mxnet/gluon/nn/basic_layers.py @@ -50,9 +50,15 @@ def add(self, *blocks): for block in blocks: self.register_child(block) - def forward(self, x): + def forward(self, x, *args): for block in self._children.values(): - x = block(x) + x = block(x, *args) + args = [] + if isinstance(x, (tuple, list)): + args = x[1:] + x = x[0] + if args: + x = tuple([x] + list(args)) return x def __repr__(self): @@ -114,9 +120,15 @@ def add(self, *blocks): for block in blocks: self.register_child(block) - def hybrid_forward(self, F, x): + def hybrid_forward(self, F, x, *args): for block in self._children.values(): - x = block(x) + x = block(x, *args) + args = [] + if isinstance(x, (tuple, list)): + args = x[1:] + x = x[0] + if args: + x = tuple([x] + list(args)) return x def __repr__(self): diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py index 8b429a4544f3..86b1cf25ae62 100644 --- a/python/mxnet/image/image.py +++ b/python/mxnet/image/image.py @@ -640,13 +640,13 @@ def imrotate(src, rotation_degrees, zoom_in=False, zoom_out=False): """ if zoom_in and zoom_out: raise ValueError("`zoom_in` and `zoom_out` cannot be both True") - if src.dtype is not np.float32: + if np.dtype(src.dtype) is not np.dtype(np.float32): raise TypeError("Only `float32` images are supported by this function") # handles the case in which a single image is passed to this function expanded = False if src.ndim == 3: expanded = True - src = src.expand_dims(axis=0) + src = _mx_np.expand_dims(src, 0) if is_np_array() else src.expand_dims(axis=0) if not isinstance(rotation_degrees, Number): raise TypeError("When a single image is passed the rotation angle is " "required to be a scalar.") @@ -714,7 +714,11 @@ def imrotate(src, rotation_degrees, zoom_in=False, zoom_out=False): grid = nd.concat(w_matrix_rot.expand_dims(axis=1), h_matrix_rot.expand_dims(axis=1), dim=1) grid = grid * globalscale + if is_np_array(): + src = src.as_nd_ndarray() rot_img = nd.BilinearSampler(src, grid) + if is_np_array(): + rot_img = rot_img.as_np_ndarray() if expanded: return rot_img[0] return rot_img diff --git a/python/mxnet/io/io.py b/python/mxnet/io/io.py index 5a022ea1c81c..8e57c1939032 100644 --- a/python/mxnet/io/io.py +++ b/python/mxnet/io/io.py @@ -33,7 +33,7 @@ from ..base import check_call, build_param_doc as _build_param_doc from ..ndarray import NDArray from ..ndarray.sparse import CSRNDArray -from ..ndarray import _ndarray_cls +from ..util import is_np_array 
from ..ndarray import array from ..ndarray import concat, tile @@ -822,9 +822,13 @@ class MXDataIter(DataIter): -------- src/io : The underlying C++ data iterator implementation, e.g., `CSVIter`. """ - def __init__(self, handle, data_name='data', label_name='softmax_label', **_): + def __init__(self, handle, data_name='data', label_name='softmax_label', **kwargs): super(MXDataIter, self).__init__() + from ..ndarray import _ndarray_cls + from ..numpy.multiarray import _np_ndarray_cls + self._create_ndarray_fn = _np_ndarray_cls if is_np_array() else _ndarray_cls self.handle = handle + self._kwargs = kwargs # debug option, used to test the speed with io effect eliminated self._debug_skip_load = False @@ -881,12 +885,12 @@ def iter_next(self): def getdata(self): hdl = NDArrayHandle() check_call(_LIB.MXDataIterGetData(self.handle, ctypes.byref(hdl))) - return _ndarray_cls(hdl, False) + return self._create_ndarray_fn(hdl, False) def getlabel(self): hdl = NDArrayHandle() check_call(_LIB.MXDataIterGetLabel(self.handle, ctypes.byref(hdl))) - return _ndarray_cls(hdl, False) + return self._create_ndarray_fn(hdl, False) def getindex(self): index_size = ctypes.c_uint64(0) @@ -907,6 +911,24 @@ def getpad(self): check_call(_LIB.MXDataIterGetPadNum(self.handle, ctypes.byref(pad))) return pad.value + def getitems(self): + output_vars = ctypes.POINTER(NDArrayHandle)() + num_output = ctypes.c_int(0) + check_call(_LIB.MXDataIterGetItems(self.handle, + ctypes.byref(num_output), + ctypes.byref(output_vars))) + out = [self._create_ndarray_fn(ctypes.cast(output_vars[i], NDArrayHandle), + False) for i in range(num_output.value)] + return tuple(out) + + def __len__(self): + length = ctypes.c_int64(-1) + check_call(_LIB.MXDataIterGetLenHint(self.handle, ctypes.byref(length))) + if length.value < 0: + return 0 + return length.value + + def _make_io_iterator(handle): """Create an io iterator by handle.""" name = ctypes.c_char_p() @@ -956,6 +978,14 @@ def creator(*args, **kwargs): param_vals = [] for k, val in kwargs.items(): + if iter_name == 'ThreadedDataLoader': + # convert ndarray to handle + if hasattr(val, 'handle'): + val = val.handle.value + elif isinstance(val, (tuple, list)): + val = [vv.handle.value if hasattr(vv, 'handle') else vv for vv in val] + elif isinstance(getattr(val, '_iter', None), MXDataIter): + val = val._iter.handle.value param_keys.append(k) param_vals.append(str(val)) # create atomic symbol diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index f889bd7729a7..824bc42ec6ea 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -2697,6 +2697,41 @@ def fabs(x, out=None, **kwargs): return _pure_unary_func_helper(x, _api_internal.abs, _np.abs, out=out, **kwargs) +@set_module('mxnet.ndarray.numpy') +@wrap_np_unary_func +def absolute(x, out=None, **kwargs): + r""" + Calculate the absolute value element-wise. + + This function returns the absolute values (positive magnitude) of the + data in `x`. Complex values are not handled, use `absolute` to find the + absolute values of complex data. + + Parameters + ---------- + x : ndarray or scalar + Input array. + out : ndarray or None, optional + A location into which the result is stored. If provided, it must have + a shape that the inputs broadcast to. If not provided or `None`, + a freshly-allocated array is returned. + + Returns + ------- + absolute : ndarray + An ndarray containing the absolute value of + each element in `x`. This is a scalar if `x` is a scalar. 
+
+    Examples
+    --------
+    >>> np.fabs(-1)
+    1.0
+    >>> np.fabs(np.array([-1.2, 1.2]))
+    array([ 1.2,  1.2])
+    """
+    return _pure_unary_func_helper(x, _api_internal.abs, _np.abs, out=out, **kwargs)
+
+
 @set_module('mxnet.ndarray.numpy')
 @wrap_np_unary_func
 def absolute(x, out=None, **kwargs):
@@ -4011,6 +4046,7 @@ def transpose(a, axes=None):
     return _api_internal.transpose(a, axes)
 
 
+
 # pylint: disable=redefined-outer-name
 @set_module('mxnet.ndarray.numpy')
 def split(ary, indices_or_sections, axis=0):
diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index f8cf69aaa1c4..a8353a701f15 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -5848,6 +5848,114 @@ def triu(m, k=0):
     return _mx_nd_np.triu(m, k)
 
 
+@set_module('mxnet.numpy')
+def tril_indices(n, k=0, m=None):
+    """Return the indices for the lower-triangle of an (n, m) array.
+    Parameters
+    ----------
+    n : int
+        The row dimension of the arrays for which the returned
+        indices will be valid.
+    k : int, optional
+        Diagonal offset (see `tril` for details).
+    m : int, optional
+        .. versionadded:: 1.9.0
+
+        The column dimension of the arrays for which the returned
+        arrays will be valid.
+        By default `m` is taken equal to `n`.
+
+    Returns
+    -------
+    inds : tuple of arrays
+        The indices for the triangle. The returned tuple contains two arrays,
+        each with the indices along one dimension of the array.
+
+    See also
+    --------
+    triu_indices : similar function, for upper-triangular.
+    mask_indices : generic function accepting an arbitrary mask function.
+    tril, triu
+
+    Examples
+    --------
+    Compute two different sets of indices to access 4x4 arrays, one for the
+    lower triangular part starting at the main diagonal, and one starting two
+    diagonals further right:
+
+    >>> il1 = np.tril_indices(4)
+    >>> il2 = np.tril_indices(4, 2)
+
+    Here is how they can be used with a sample array:
+
+    >>> a = np.arange(16).reshape(4, 4)
+    >>> a
+    array([[ 0,  1,  2,  3],
+           [ 4,  5,  6,  7],
+           [ 8,  9, 10, 11],
+           [12, 13, 14, 15]])
+
+    Both for indexing:
+
+    >>> a[il1]
+    array([ 0,  4,  5,  8,  9, 10, 12, 13, 14, 15])
+
+    And for assigning values:
+
+    >>> a[il1] = -1
+    >>> a
+    array([[-1,  1,  2,  3],
+           [-1, -1,  6,  7],
+           [-1, -1, -1, 11],
+           [-1, -1, -1, -1]])
+
+    These cover almost the whole array (two diagonals right of the main one):
+
+    >>> a[il2] = -10
+    >>> a
+    array([[-10, -10, -10,   3],
+           [-10, -10, -10, -10],
+           [-10, -10, -10, -10],
+           [-10, -10, -10, -10]])
+
+    """
+    if m is None:
+        m = n
+    return tuple(_mx_nd_np.tril_indices(n, k, m))
+
+
 @set_module('mxnet.numpy')
 def arange(start, stop=None, step=1, dtype=None, ctx=None):
     """Return evenly spaced values within a given interval.
diff --git a/python/mxnet/util.py b/python/mxnet/util.py
index c685cf32bb12..79399709274b 100644
--- a/python/mxnet/util.py
+++ b/python/mxnet/util.py
@@ -903,3 +903,29 @@ def get_cuda_compute_capability(ctx):
         raise RuntimeError('cuDeviceComputeCapability failed with error code {}: {}'
                            .format(ret, error_str.value.decode()))
     return cc_major.value * 10 + cc_minor.value
+
+def default_array(source_array, ctx=None, dtype=None):
+    """Creates an array from any object exposing the default (nd or np) array interface.
+
+    Parameters
+    ----------
+    source_array : array_like
+        An object exposing the array interface, an object whose `__array__`
+        method returns an array, or any (nested) sequence.
+    ctx : Context, optional
+        Device context (default is the current default context).
+    dtype : str or numpy.dtype, optional
+        The data type of the output array. The default dtype is ``source_array.dtype``
+        if `source_array` is an `NDArray`, `float32` otherwise.
+
+    Returns
+    -------
+    NDArray
+        An `NDArray` (np or nd, depending on the active array mode) with the
+        same contents as the `source_array`.
+    """
+    from . import nd as _mx_nd
+    from . import np as _mx_np
+    if is_np_array():
+        return _mx_np.array(source_array, ctx=ctx, dtype=dtype)
+    else:
+        return _mx_nd.array(source_array, ctx=ctx, dtype=dtype)
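`default_array` is a thin dispatch helper: the same call site yields a classic `mx.nd.NDArray` or an `mx.np.ndarray` depending on whether numpy semantics are active. A sketch of that behavior:

    import mxnet as mx
    from mxnet.util import default_array

    arr = default_array([1, 2, 3])
    print(type(arr))    # classic mx.nd.NDArray in the default mode

    mx.npx.set_np()     # turn on numpy semantics
    arr = default_array([1, 2, 3])
    print(type(arr))    # mxnet.numpy.ndarray once is_np_array() is True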
diff --git a/scala-package/examples/scripts/module/mnist_mlp.sh b/scala-package/examples/scripts/module/mnist_mlp.sh
index 907552a45b46..41d019b1473b 100755
--- a/scala-package/examples/scripts/module/mnist_mlp.sh
+++ b/scala-package/examples/scripts/module/mnist_mlp.sh
@@ -1,5 +1,3 @@
-#!/bin/bash
-
 # Licensed to the Apache Software Foundation (ASF) under one
 # or more contributor license agreements.  See the NOTICE file
 # distributed with this work for additional information
@@ -16,6 +14,11 @@
 # KIND, either express or implied.  See the License for the
 # specific language governing permissions and limitations
 # under the License.
 
 ROOT_DIR=$(cd `dirname $0`/../../..; pwd)
 CLASSPATH=$ROOT_DIR/assembly/target/*:$ROOT_DIR/examples/target/*:$ROOT_DIR/examples/target/classes/lib/*
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 82f3b71be1b8..ea39d9ac6e5b 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1759,7 +1759,7 @@ int MXNDArrayAt64(NDArrayHandle handle,
   API_END_HANDLE_ERROR(delete ptr);
 }
 
-MXNET_DLL int MXNDArrayReshape(NDArrayHandle handle,
+int MXNDArrayReshape(NDArrayHandle handle,
                                int ndim,
                                int *dims,
                                NDArrayHandle *out) {
@@ -1795,7 +1795,7 @@ MXNET_DLL int MXNDArrayReshape(NDArrayHandle handle,
   API_END_HANDLE_ERROR(delete ptr);
 }
 
-MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle,
+int MXNDArrayReshape64(NDArrayHandle handle,
                                  int ndim,
                                  dim_t *dims,
                                  bool reverse,
@@ -2178,6 +2178,12 @@ int MXDataIterBeforeFirst(DataIterHandle handle) {
   API_END();
 }
 
+int MXDataIterGetLenHint(DataIterHandle handle, int64_t *len) {
+  API_BEGIN();
+  *len = static_cast<IIterator<DataBatch>* >(handle)->GetLenHint();
+  API_END();
+}
+
 int MXDataIterNext(DataIterHandle handle, int *out) {
   API_BEGIN();
   *out = static_cast<IIterator<DataBatch>* >(handle)->Next();
@@ -2187,11 +2193,16 @@ int MXDataIterNext(DataIterHandle handle, int *out) {
 int MXDataIterGetLabel(DataIterHandle handle, NDArrayHandle *out) {
   API_BEGIN();
   const DataBatch& db = static_cast<IIterator<DataBatch>* >(handle)->Value();
+  bool no_label = db.data.size() < 2U;
   NDArray* pndarray = new NDArray();
   // temp hack to make label 1D
   // TODO(tianjun) make label 1D when label_width=0
-  mxnet::TShape shape = db.data[1].shape();
-  if (shape.ndim() > 1 && shape[1] == 1) {
+  mxnet::TShape shape = no_label ? TShape({1, }) : db.data[1].shape();
+  if (no_label || shape.Size() < 1) {
+    // it's possible that label is not available and not required
+    // but we need to bypass the invalid copy
+    *pndarray = NDArray(TShape({1}), mxnet::Context::CPU(0));
+  } else if (shape.ndim() > 1 && shape[1] == 1) {
     *pndarray = db.data[1].Reshape(mshadow::Shape1(shape[0]));
   } else {
     *pndarray = db.data[1];
@@ -2200,6 +2211,38 @@ int MXDataIterGetLabel(DataIterHandle handle, NDArrayHandle *out) {
   API_END();
 }
 
+int MXDataIterGetItems(DataIterHandle handle, int* num_outputs, NDArrayHandle **outputs) {
+  MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
+  API_BEGIN();
+  const DataBatch& db = static_cast<IIterator<DataBatch>* >(handle)->Value();
+  std::vector<NDArray*> ndoutputs;
+  ndoutputs.reserve(db.data.size());
+  if (*outputs == nullptr) {
+    *num_outputs = db.data.size();
+    for (int i = 0; i < *num_outputs; ++i) ndoutputs.push_back(new NDArray());
+  } else {
+    CHECK_EQ(*num_outputs, db.data.size())
+      << "MXDataIterGetItems expects " << db.data.size() << " outputs, but "
+      << *num_outputs << " was given.";
+    for (int i = 0; i < *num_outputs; ++i) {
+      ndoutputs.push_back(reinterpret_cast<NDArray*>((*outputs)[i]));
+    }
+  }
+
+  // copy outputs
+  for (int i = 0; i < *num_outputs; ++i) *ndoutputs[i] = db.data[i];
+
+  if (*outputs == nullptr) {
+    ret->ret_handles.clear();
+    ret->ret_handles.reserve(*num_outputs);
+    for (int i = 0; i < *num_outputs; ++i) {
+      ret->ret_handles.push_back(ndoutputs[i]);
+    }
+    *outputs = dmlc::BeginPtr(ret->ret_handles);
+  }
+  API_END();
+}
+
 int MXDataIterGetIndex(DataIterHandle handle, uint64_t **out_index, uint64_t *out_size) {
   API_BEGIN();
   const DataBatch& db = static_cast<IIterator<DataBatch>* >(handle)->Value();
@@ -2224,6 +2267,192 @@ int MXDataIterGetPadNum(DataIterHandle handle, int *pad) {
   API_END();
 }
 
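`MXDataIterGetLenHint` lets a front-end ask an iterator for its length (negative when unknown) without consuming it. A hypothetical ctypes wrapper in the style MXNet's Python front-end uses elsewhere; `iter_len_hint` and its argument are illustrative, not part of this patch:

    import ctypes
    from mxnet.base import _LIB, check_call

    def iter_len_hint(iter_handle):
        """Return the iterator's length hint, or a negative value if unknown."""
        hint = ctypes.c_int64(-1)
        check_call(_LIB.MXDataIterGetLenHint(iter_handle, ctypes.byref(hint)))
        return hint.value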
+int MXListDatasets(uint32_t *out_size,
+                   DatasetCreator **out_array) {
+  API_BEGIN();
+  auto &vec = dmlc::Registry<DatasetReg>::List();
+  *out_size = static_cast<uint32_t>(vec.size());
+  *out_array = (DatasetCreator*)(dmlc::BeginPtr(vec));  // NOLINT(*)
+  API_END();
+}
+
+int MXDatasetCreateDataset(DatasetCreator handle,
+                           uint32_t num_param,
+                           const char **keys,
+                           const char **vals,
+                           DatasetHandle *out) {
+  Dataset *dataset = nullptr;
+  API_BEGIN();
+  DatasetReg *e = static_cast<DatasetReg*>(handle);
+  std::vector<std::pair<std::string, std::string> > kwargs;
+  for (uint32_t i = 0; i < num_param; ++i) {
+    kwargs.push_back({std::string(keys[i]), std::string(vals[i])});
+  }
+  dataset = e->body(kwargs);
+  *out = new std::shared_ptr<Dataset>(dataset);
+  API_END_HANDLE_ERROR(delete dataset);
+}
+
+int MXDatasetGetDatasetInfo(DatasetCreator creator,
+                            const char **name,
+                            const char **description,
+                            uint32_t *num_args,
+                            const char ***arg_names,
+                            const char ***arg_type_infos,
+                            const char ***arg_descriptions) {
+  DatasetReg *e = static_cast<DatasetReg*>(creator);
+  return MXAPIGetFunctionRegInfo(e, name, description, num_args,
+                                 arg_names, arg_type_infos, arg_descriptions,
+                                 NULL);
+}
+
+int MXDatasetFree(DatasetHandle handle) {
+  API_BEGIN();
+  delete static_cast<std::shared_ptr<Dataset>*>(handle);
+  API_END();
+}
+
+int MXDatasetGetLen(DatasetHandle handle, uint64_t *out) {
+  API_BEGIN();
+  uint64_t len = (*static_cast<std::shared_ptr<Dataset>*>(handle))->GetLen();
+  *out = len;
+  API_END();
+}
+
+int MXDatasetGetItems(DatasetHandle handle,
+                      uint64_t index,
+                      int* num_outputs,
+                      NDArrayHandle **outputs) {
+  MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
+  API_BEGIN();
+  std::vector<NDArray> res;
+  CHECK((*static_cast<std::shared_ptr<Dataset>*>(handle))->GetItem(index, &res))
+    << "Error getting item at index: " << index;
+  std::vector<NDArray*> ndoutputs;
+  ndoutputs.reserve(res.size());
+  if (*outputs == nullptr) {
+    *num_outputs = res.size();
+    for (int i = 0; i < *num_outputs; ++i) ndoutputs.push_back(new NDArray());
+  } else {
+    CHECK_EQ(*num_outputs, res.size())
+      << "MXDatasetGetItems expects " << res.size() << " outputs, but "
+      << *num_outputs << " was given.";
+    for (int i = 0; i < *num_outputs; ++i) {
+      ndoutputs.push_back(reinterpret_cast<NDArray*>((*outputs)[i]));
+    }
+  }
+  // copy ndarrays
+  for (int i = 0; i < *num_outputs; ++i) *(ndoutputs[i]) = res[i];
+
+  if (*outputs == nullptr) {
+    ret->ret_handles.clear();
+    ret->ret_handles.reserve(*num_outputs);
+    for (int i = 0; i < *num_outputs; ++i) {
+      ret->ret_handles.push_back(ndoutputs[i]);
+    }
+    *outputs = dmlc::BeginPtr(ret->ret_handles);
+  }
+  API_END();
+}
+
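The dataset C API above follows a create/query/free life cycle around a heap-allocated `std::shared_ptr<Dataset>`. A hypothetical ctypes sketch of the read side; the wrapper names are illustrative, and in practice the Python front-end in this patch wraps these calls for Gluon datasets:

    import ctypes
    from mxnet.base import _LIB, check_call

    def dataset_len(dataset_handle):
        """Number of samples, via MXDatasetGetLen."""
        out = ctypes.c_uint64(0)
        check_call(_LIB.MXDatasetGetLen(dataset_handle, ctypes.byref(out)))
        return out.value

    def dataset_free(dataset_handle):
        """Release the shared_ptr allocated by MXDatasetCreateDataset."""
        check_call(_LIB.MXDatasetFree(dataset_handle))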
+int MXListBatchifyFunctions(uint32_t *out_size,
+                            BatchifyFunctionCreator **out_array) {
+  API_BEGIN();
+  auto &vec = dmlc::Registry<BatchifyFunctionReg>::List();
+  *out_size = static_cast<uint32_t>(vec.size());
+  *out_array = (BatchifyFunctionCreator*)(dmlc::BeginPtr(vec));  // NOLINT(*)
+  API_END();
+}
+
+int MXBatchifyFunctionCreateFunction(BatchifyFunctionCreator handle,
+                                     uint32_t num_param,
+                                     const char **keys,
+                                     const char **vals,
+                                     BatchifyFunctionHandle *out) {
+  BatchifyFunction *bf = nullptr;
+  API_BEGIN();
+  BatchifyFunctionReg *e = static_cast<BatchifyFunctionReg*>(handle);
+  std::vector<std::pair<std::string, std::string> > kwargs;
+  for (uint32_t i = 0; i < num_param; ++i) {
+    kwargs.push_back({std::string(keys[i]), std::string(vals[i])});
+  }
+  bf = e->body(kwargs);
+  *out = new BatchifyFunctionPtr(bf);
+  API_END_HANDLE_ERROR(delete bf);
+}
+
+int MXBatchifyFunctionGetFunctionInfo(BatchifyFunctionCreator creator,
+                                      const char **name,
+                                      const char **description,
+                                      uint32_t *num_args,
+                                      const char ***arg_names,
+                                      const char ***arg_type_infos,
+                                      const char ***arg_descriptions) {
+  BatchifyFunctionReg *e = static_cast<BatchifyFunctionReg*>(creator);
+  return MXAPIGetFunctionRegInfo(e, name, description, num_args,
+                                 arg_names, arg_type_infos, arg_descriptions,
+                                 NULL);
+}
+
+int MXBatchifyFunctionInvoke(BatchifyFunctionHandle handle,
+                             int batch_size,
+                             int num_output,
+                             NDArrayHandle *inputs,
+                             NDArrayHandle **outputs) {
+  MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
+  API_BEGIN();
+  CHECK_GT(batch_size, 0);
+  CHECK_GT(num_output, 0);
+  std::vector<std::vector<NDArray> > ndinputs;
+  ndinputs.reserve(batch_size);
+  int pos = 0;
+  for (int i = 0; i < batch_size; ++i) {
+    std::vector<NDArray> tmp;
+    tmp.reserve(num_output);
+    for (int j = 0; j < num_output; ++j) {
+      tmp.emplace_back(*reinterpret_cast<NDArray*>(inputs[pos++]));
+      tmp.back().WaitToRead();
+    }
+    ndinputs.emplace_back(tmp);
+  }
+  std::vector<NDArray> res;
+  CHECK((*static_cast<BatchifyFunctionPtr*>(handle))->Batchify(ndinputs, &res))
+    << "Error calling batchify with " << ndinputs.size() << " inputs";
+  std::vector<NDArray*> ndoutputs;
+  ndoutputs.reserve(res.size());
+  if (*outputs == nullptr) {
+    for (int i = 0; i < num_output; ++i) ndoutputs.push_back(new NDArray());
+  } else {
+    CHECK_EQ(num_output, res.size())
+      << "MXBatchifyFunctionInvoke expects " << res.size() << " outputs, but "
+      << num_output << " was given.";
+    for (int i = 0; i < num_output; ++i) {
+      ndoutputs.push_back(reinterpret_cast<NDArray*>((*outputs)[i]));
+    }
+  }
+
+  // copy ndarrays
+  for (int i = 0; i < num_output; ++i) *(ndoutputs[i]) = res[i];
+
+  if (*outputs == nullptr) {
+    ret->ret_handles.clear();
+    ret->ret_handles.reserve(num_output);
+    for (int i = 0; i < num_output; ++i) {
+      ret->ret_handles.push_back(ndoutputs[i]);
+    }
+    *outputs = dmlc::BeginPtr(ret->ret_handles);
+  }
+  API_END();
+}
+
+int MXBatchifyFunctionFree(BatchifyFunctionHandle handle) {
+  API_BEGIN();
+  delete static_cast<BatchifyFunctionPtr*>(handle);
+  API_END();
+}
+//--------------------------------------------
+// Part 6: basic KVStore interface
+//--------------------------------------------
+
 int MXKVStoreCreate(const char *type,
                     KVStoreHandle *out) {
   API_BEGIN();
diff --git a/src/imperative/cached_op.cc b/src/imperative/cached_op.cc
index 4c29ee01441d..8a7e950fe40b 100644
--- a/src/imperative/cached_op.cc
+++ b/src/imperative/cached_op.cc
@@ -32,15 +32,9 @@ DMLC_REGISTER_PARAMETER(CachedOpConfig);
 
 constexpr uint32_t kEidNotExist = std::numeric_limits<uint32_t>::max();
 
-struct CachedOp::DynamicRuntime {
-  GraphInfo info;
-  std::vector<NDArray> buff;
-  std::vector<OpStatePtr> op_states;
-};
-
 CachedOp::CachedOp(
     const nnvm::Symbol& sym,
-    const std::vector<std::pair<std::string, std::string> >& flags) {
+    const std::vector<std::pair<std::string, std::string> >& flags) : sym_(sym), flags_(flags) {
   config_.Init(flags);
   this->dynamic_shape_checked_ = false;
diff --git a/src/imperative/cached_op.h b/src/imperative/cached_op.h
index 731ba2efa082..1a395574176f 100644
--- a/src/imperative/cached_op.h
+++ b/src/imperative/cached_op.h
@@ -358,6 +358,10 @@ struct CachedOpConfig : public dmlc::Parameter<CachedOpConfig> {
   }
 };
 
+namespace io {
+class LazyTransformDataset;
+}
+
 class CachedOp {
   using CachedOpMonCallback = std::function<void(const char *, const char *, void *)>;
@@ -523,11 +527,9 @@ class CachedOp {
       const Context& default_ctx,
       const std::vector<NDArray*>& inputs,
       const std::vector<NDArray*>& outputs);
-
-
- private:
   struct DynamicRuntime;
+ private:
   OpStatePtr DynamicForward(
       const Context& default_ctx,
       const std::vector<NDArray*>& inputs,
@@ -561,6 +563,16 @@ class CachedOp {
   std::mutex mutex_;
   std::unordered_map<Context, std::vector<OpStatePtr> > cached_op_states_;
+
+  friend class ::mxnet::io::LazyTransformDataset;
+  nnvm::Symbol sym_;
+  std::vector<std::pair<std::string, std::string> > flags_;
+};
+
+struct CachedOp::DynamicRuntime {
+  GraphInfo info;
+  std::vector<NDArray> buff;
+  std::vector<OpStatePtr> op_states;
 };
 
 using CachedOpPtr = std::shared_ptr<CachedOp>;
diff --git a/src/imperative/imperative_utils.cc 
b/src/imperative/imperative_utils.cc index 5491457b188f..c8543d73074b 100644 --- a/src/imperative/imperative_utils.cc +++ b/src/imperative/imperative_utils.cc @@ -183,7 +183,8 @@ void NaiveRunGraph( bool recording, mxnet::ShapeVector *shapes, const imperative::CachedOpMonCallback& callback, - const bool monitor_all) { + const bool monitor_all, + const bool skip_engine) { for (size_t i = node_start; i < node_end; ++i) { const nnvm::IndexedGraph::Node& node = idx[i]; if (node.source->op() == nullptr) { @@ -201,8 +202,16 @@ void NaiveRunGraph( DispatchMode dispatch_mode = DispatchMode::kUndefined; SetShapeType(ctx, node.source->attrs, ndinputs, ndoutputs, &dispatch_mode); SetWriteInplaceReq(ndinputs, ndoutputs, &req); - Imperative::Get()->InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, + if (skip_engine) { + auto new_attr = node.source->attrs; + CHECK(new_attr.dict.find(SKIP_ENGINE) == new_attr.dict.end()); + new_attr.dict[SKIP_ENGINE] = SKIP_ENGINE_SET; + Imperative::Get()->InvokeOp(ctx, new_attr, ndinputs, ndoutputs, req, dispatch_mode, state); + } else { + Imperative::Get()->InvokeOp(ctx, node.source->attrs, ndinputs, ndoutputs, + req, dispatch_mode, state); + } for (size_t j = 0; j < ndoutputs.size(); ++j) { if (mxnet::op::shape_is_none(ndoutputs[j]->shape())) { ndoutputs[j]->WaitToRead(); diff --git a/src/imperative/imperative_utils.h b/src/imperative/imperative_utils.h index 12546ae6e11c..d6850ad3e569 100644 --- a/src/imperative/imperative_utils.h +++ b/src/imperative/imperative_utils.h @@ -39,6 +39,17 @@ namespace mxnet { namespace imperative { +namespace { + static const char SKIP_ENGINE[] = "__skip_engine__"; + static const char SKIP_ENGINE_SET[] = "__true__"; + + inline bool CheckIfSkipEngine(const nnvm::NodeAttrs& attrs) { + const auto& skip_engine_attr = attrs.dict.find(SKIP_ENGINE); + if (skip_engine_attr == attrs.dict.end()) return false; + return (*skip_engine_attr).second == SKIP_ENGINE_SET; + } +} + struct MemoryPlanInfo { int storage_id; uint32_t root; @@ -456,41 +467,47 @@ inline void PushFCompute(const FCompute& fn, CHECK(exec_type == ExecType::kSync); std::vector inputs, outputs; DerefInputOutput(p_inputs, p_outputs, &inputs, &outputs); - Engine::Get()->PushSync( - [=](RunContext rctx) { - std::vector input_blobs, output_blobs; - // pre-fcompute and post-fcompute storage fallback src NDArrays and dst NDArrays - std::vector pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src; - // mapping from index in input_blobs to index in pre_temp_dst - std::unordered_map in_temp_idx_map; + const auto& run = [=](RunContext rctx) { + std::vector input_blobs, output_blobs; + // pre-fcompute and post-fcompute storage fallback src NDArrays and dst NDArrays + std::vector pre_temp_src, pre_temp_dst, post_temp_dst, post_temp_src; + // mapping from index in input_blobs to index in pre_temp_dst + std::unordered_map in_temp_idx_map; #if MXNET_USE_MKLDNN == 1 - if (exec_type != ExecType::kCrossDeviceCopy) { - // kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in - // its FCcomputeEx, but AsyncPush the copy operation to engine. - // So for the case that A is holding mkldnn memory, and then copy A to B, and then copy B - // back to A, we shouldn't invalidate outputs for copying B back to A, because at this time, - // copying A to B may not happen, and will corrupt A's memory. 
- InvalidateOutputs(outputs, req); - } + if (exec_type != ExecType::kCrossDeviceCopy) { + // kCrossDeviceCopy is used for `_copy_to` operator, which doesn't compute immediately in + // its FCcomputeEx, but AsyncPush the copy operation to engine. + // So for the case that A is holding mkldnn memory, and then copy A to B, and then copy B + // back to A, we shouldn't invalidate outputs for copying B back to A, because at this time, + // copying A to B may not happen, and will corrupt A's memory. + InvalidateOutputs(outputs, req); + } #endif - std::vector tmp_req = req; - // setup blobs - SetupDefaultBlobsInOut(inputs, outputs, nullptr, nullptr, &tmp_req, - &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst, - &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx); - // setup context - OpContext opctx{need_grad, is_train, rctx, engine::CallbackOnComplete(), requested}; - bool is_gpu = ctx.dev_mask() == gpu::kDevMask; - // pre-fcompute fallback, cast to default storage type - CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu); - fn(attrs, opctx, input_blobs, tmp_req, output_blobs); - // post-fcompute fallback, cast to original storage type - CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu); - if (is_gpu && !rctx.is_bulk) { - rctx.get_stream()->Wait(); - } - }, ctx, read_vars, write_vars, FnProperty::kNormal, + std::vector tmp_req = req; + // setup blobs + SetupDefaultBlobsInOut(inputs, outputs, nullptr, nullptr, &tmp_req, + &input_blobs, &output_blobs, &pre_temp_src, &pre_temp_dst, + &post_temp_src, &post_temp_dst, &in_temp_idx_map, mutate_idx); + // setup context + OpContext opctx{need_grad, is_train, rctx, engine::CallbackOnComplete(), requested}; + bool is_gpu = ctx.dev_mask() == gpu::kDevMask; + // pre-fcompute fallback, cast to default storage type + CastNonDefaultStorage(pre_temp_src, pre_temp_dst, opctx, is_gpu); + fn(attrs, opctx, input_blobs, tmp_req, output_blobs); + // post-fcompute fallback, cast to original storage type + CastNonDefaultStorage(post_temp_src, post_temp_dst, opctx, is_gpu); + if (is_gpu && !rctx.is_bulk) { + rctx.get_stream()->Wait(); + } + }; + if (CheckIfSkipEngine(attrs)) { + // execute without engine + run(RunContext{ctx, nullptr, nullptr, false}); + } else { + Engine::Get()->PushSync( + run, ctx, read_vars, write_vars, FnProperty::kNormal, 0, op->name.c_str()); + } } inline void PushFComputeEx(const FComputeEx& fn, @@ -537,8 +554,7 @@ inline void PushFComputeEx(const FComputeEx& fn, rctx.get_stream()->Wait(); } }; - - if (exec_type == ExecType::kCrossDeviceCopy) { + if (exec_type == ExecType::kCrossDeviceCopy || CheckIfSkipEngine(attrs)) { run(RunContext{ctx, nullptr, nullptr, false}); } else { CHECK(exec_type == ExecType::kSync); @@ -605,7 +621,7 @@ inline void PushOperator(const OpStatePtr& state, // For operators with subgraphs, we need to invoke them in the main thread // instead of the threaded engine. 
- if (exec_type == ExecType::kSubgraphExec) { + if (exec_type == ExecType::kSubgraphExec || CheckIfSkipEngine(attrs)) { RunContext rctx{ctx, nullptr, nullptr, false}; run(rctx, engine::CallbackOnComplete()); } else if (exec_type == ExecType::kSync) { @@ -660,7 +676,7 @@ inline void PushOperator(const OpStatePtr& state, } }; - if (exec_type == ExecType::kSubgraphExec) { + if (exec_type == ExecType::kSubgraphExec || CheckIfSkipEngine(attrs)) { RunContext rctx{ctx, nullptr, nullptr, false}; run(rctx, engine::CallbackOnComplete()); } else if (exec_type == ExecType::kSync) { @@ -1165,7 +1181,8 @@ void NaiveRunGraph(const bool retain_graph, bool recording, mxnet::ShapeVector *shapes, const CachedOpMonCallback& callback = nullptr, - const bool monitor_all_ = false); + const bool monitor_all_ = false, + const bool skip_engine = false); } // namespace imperative } // namespace mxnet diff --git a/src/imperative/naive_cached_op.cc b/src/imperative/naive_cached_op.cc new file mode 100644 index 000000000000..6138ce89cd26 --- /dev/null +++ b/src/imperative/naive_cached_op.cc @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#include +#include +#include "./imperative_utils.h" +#include "./naive_cached_op.h" +#include "../executor/exec_pass.h" +#include "../profiler/profiler.h" +#include "../operator/operator_common.h" +#include "../operator/subgraph/common.h" + + +namespace mxnet { +OpStatePtr NaiveCachedOp::Forward( + const std::shared_ptr& op_ptr, + const std::vector& inputs, + const std::vector& outputs) { + + CHECK_EQ(inputs.size(), num_inputs()); + + Context default_ctx = inputs[0]->ctx(); + { + auto state_ptr = GetCachedOpState(default_ctx); + auto& state = state_ptr.get_state(); + + const auto& idx = state.info.fwd_graph.indexed_graph(); + for (size_t i = 0; i < inputs.size(); ++i) { + CHECK_EQ(inputs[i]->ctx(), default_ctx) + << "CachedOp requires all inputs to live on the same context. 
But " + << idx[idx.input_nodes()[0]].source->attrs.name + << " is on " << default_ctx << " while " + << idx[idx.input_nodes()[i]].source->attrs.name + << " is on " << inputs[i]->ctx(); + } + } + + OpStatePtr op_state; + try { + // Initialize + bool recording = false; + op_state = OpStatePtr::Create(); + auto& runtime = op_state.get_state(); + { + auto state_ptr = GetCachedOpState(default_ctx); + auto& state = state_ptr.get_state(); + std::lock_guard lock(state.mutex); + SetForwardGraph(&state.info, recording, inputs); + runtime.info.fwd_graph = state.info.fwd_graph; + } + nnvm::Graph& g = runtime.info.fwd_graph; + const auto& idx = g.indexed_graph(); + auto& buff = runtime.buff; + auto& states = runtime.op_states; + + // Allocate entries + buff.resize(idx.num_node_entries()); + states.resize(idx.num_nodes()); + std::vector arrays; + arrays.reserve(buff.size()); + for (auto& buffered_array : buff) { + arrays.push_back(&buffered_array); + } + std::vector array_reqs(arrays.size(), kWriteTo); + const auto& dispatch_modes = g.GetAttr("dispatch_mode"); + const std::string& graph_type = recording ? FULL : FORWARD; + std::vector ref_count = + g.GetAttr >(AddPrefix(graph_type, REF_COUNT)); + for (size_t i = 0; i < idx.num_node_entries(); ++i) { + if (ref_count[i] == 0) array_reqs[i] = kNullOp; + } + CollectInputOutputNDRefs(g, inputs, outputs, &arrays); + + mxnet::ShapeVector shapes = g.GetAttr("shape"); + imperative::NaiveRunGraph(false, default_ctx, idx, arrays, 0, idx.num_nodes(), + std::move(array_reqs), std::move(ref_count), &states, + dispatch_modes, false, &shapes, nullptr, false, true); + { + auto state_ptr = GetCachedOpState(default_ctx); + auto& state = state_ptr.get_state(); + auto copied_shape = shapes; + std::lock_guard lock(state.mutex); + state.info.fwd_graph.attrs["shape"] = std::make_shared(std::move(copied_shape)); + } + g.attrs["shape"] = std::make_shared(std::move(shapes)); + } catch (const dmlc::Error& e) { + throw e; + } + return op_state; +} + + +} // namespace mxnet diff --git a/src/imperative/naive_cached_op.h b/src/imperative/naive_cached_op.h new file mode 100644 index 000000000000..268c561c3415 --- /dev/null +++ b/src/imperative/naive_cached_op.h @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +// Threadsafe and minimal functionality cached op version for Inference +// lot of code reused from cached_op.h +#ifndef MXNET_IMPERATIVE_NAIVE_CACHED_OP_H_ +#define MXNET_IMPERATIVE_NAIVE_CACHED_OP_H_ + +#include +#include +#include +#include +#include +#include +#include "./cached_op.h" + + + +namespace mxnet { +/*! \brief NaiveCachedOp which does not involve engine which is useful when executed in parallel. + It does not support advanced features of CachedOp, including backward/recording, etc... 
+ */ +class NaiveCachedOp : public CachedOp { + public: + NaiveCachedOp( + const nnvm::Symbol &sym, + const std::vector> &flags) : CachedOp(sym, flags) {} + virtual ~NaiveCachedOp() {} + OpStatePtr Forward( + const std::shared_ptr& op_ptr, + const std::vector& inputs, + const std::vector& outputs) override; + void Backward( + const bool retain_graph, + const OpStatePtr& state, + const std::vector& inputs, + const std::vector& reqs, + const std::vector& outputs) override { + LOG(FATAL) << "Backward is not supported in NaiveCachedOp."; + } + // backward storage type inference + bool BackwardStorageType( + const nnvm::NodeAttrs& attrs, + const int dev_mask, + DispatchMode* dispatch_mode, + std::vector *in_attrs, + std::vector *out_attrs) override { + LOG(FATAL) << "Backward is not supported in NaiveCachedOp."; + return false; + } +}; // NaiveCachedOp + +using NaiveCachedOpPtr = std::shared_ptr; + +} // namespace mxnet +#endif // MXNET_IMPERATIVE_NAIVE_CACHED_OP_H_ diff --git a/src/io/batchify.cc b/src/io/batchify.cc new file mode 100644 index 000000000000..ed61c742245e --- /dev/null +++ b/src/io/batchify.cc @@ -0,0 +1,397 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file batchify.cc + * \brief Mini-batch data combination functions. + */ +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "./inst_vector.h" +#include "../ndarray/ndarray_function.h" + +namespace mxnet { +namespace io { +struct GroupBatchifyParam : public dmlc::Parameter { + mxnet::Tuple functions; + // declare parameters + DMLC_DECLARE_PARAMETER(GroupBatchifyParam) { + DMLC_DECLARE_FIELD(functions) + .describe("Internal sequentially applied batchify functions. 
" + "The number of functions must match output of dataset items."); + } +}; // struct GroupBatchifyParam +DMLC_REGISTER_PARAMETER(GroupBatchifyParam); + +class GroupBatchify : public BatchifyFunction { + public: + explicit GroupBatchify(const std::vector >& kwargs) { + param_.InitAllowUnknown(kwargs); + fs_.reserve(param_.functions.ndim()); + for (int i = 0; i < param_.functions.ndim(); ++i) { + fs_.emplace_back(*static_cast( + reinterpret_cast(param_.functions[i]))); + } + } + + virtual bool Batchify(const std::vector >& inputs, + std::vector* outputs) { + auto bs = inputs.size(); + CHECK_GT(bs, 0) << "BatchifyFunction should handle at lease 1 sample"; + auto out_size = inputs[0].size(); + CHECK_EQ(out_size, fs_.size()) << "In GroupBatchifyFunction, Elem size " + << out_size << " and batchify function size " << fs_.size() << " must match"; + outputs->resize(out_size); + for (size_t i = 0; i < out_size; ++i) { + std::vector > inp; + inp.reserve(inputs.size()); + for (size_t j = 0; j < inputs.size(); ++j) { + std::vector curr({inputs[j][i]}); + inp.emplace_back(curr); + } + std::vector tmp; + if (!fs_[i]->Batchify(inp, &tmp)) return false; + (*outputs)[i] = tmp[0]; + } + return true; + } + + private: + /*! \brief params */ + GroupBatchifyParam param_; + /*! \brief internal batchify function pointers */ + std::vector fs_; +}; // class GroupBatchify + +MXNET_REGISTER_IO_BATCHIFY_FUNCTION(GroupBatchify) + .describe(R"code(Returns the GroupBatchify function. + )code" ADD_FILELINE) + .add_arguments(GroupBatchifyParam::__FIELDS__()) + .set_body([](const std::vector >& kwargs) { + return new GroupBatchify(kwargs); +}); + +struct StackBatchifyParam : public dmlc::Parameter { + /*! \brief Length of the sequence. */ + int use_shared_mem; + // declare parameters + DMLC_DECLARE_PARAMETER(StackBatchifyParam) { + DMLC_DECLARE_FIELD(use_shared_mem).set_default(0) + .describe("If 1, use shared memory."); + } +}; // struct StackBatchifyParam + +DMLC_REGISTER_PARAMETER(StackBatchifyParam); + +class StackBatchify : public BatchifyFunction { + public: + explicit StackBatchify(const std::vector >& kwargs) { + param_.InitAllowUnknown(kwargs); + } + + virtual bool Batchify(const std::vector >& inputs, + std::vector* outputs) { + auto out_size = SanityCheck(inputs); + auto bs = inputs.size(); + outputs->resize(out_size); + for (size_t i = 0; i < out_size; ++i) { + // Process i-th output + mxnet::TShape ashape = inputs[0][i].shape(); + CHECK_GE(ashape.ndim(), 0) << "Data dim must be larger than 0"; + // check if all shapes are same + for (size_t j = 1; j < bs; ++j) { + CHECK_EQ(ashape, inputs[j][i].shape()) + << "StackBatchify requires all data along batch dim to be the same, " + << "mismatch " << ashape << " vs. 
" << inputs[j][i].shape(); + } + + // calculate output ndarray size + TShape sshape(ashape.ndim() + 1, 0); + sshape[0] = bs; + for (int k = 0; k < ashape.ndim(); ++k) { + sshape[k + 1] = ashape[k]; + } + + int dtype = inputs[0][i].dtype(); + if (!(*outputs)[i].is_none() && (*outputs)[i].ctx() == mxnet::Context::CPU(0) && + (*outputs)[i].dtype() == dtype && + (*outputs)[i].storage_type() == kDefaultStorage) { + if ((*outputs)[i].shape() != sshape) { + // realloc + (*outputs)[i].ReshapeAndAlloc(sshape); + } + } else { + (*outputs)[i] = NDArray(sshape, mxnet::Context::CPU(0), false, inputs[0][i].dtype()); + } + MSHADOW_TYPE_SWITCH_WITH_BOOL(dtype, DType, { + _Pragma("omp parallel for num_threads(bs)") + for (size_t j = 0; j < bs; ++j) { + omp_exc_.Run([&] { + // inputs[j][i].WaitToRead(); + DType *ptr = (*outputs)[i].data().dptr(); + auto asize = ashape.Size(); + RunContext rctx{(*outputs)[i].ctx(), nullptr, nullptr, false}; + auto dst = TBlob( + ptr + asize * j, inputs[j][i].data().shape_, cpu::kDevMask, dtype, 0); + mxnet::ndarray::Copy( + inputs[j][i].data(), &dst, Context::CPU(), Context::CPU(), rctx); + }); + } + omp_exc_.Rethrow(); + }) + } + return true; + } + private: + /*! \brief parameters */ + StackBatchifyParam param_; + /*! \brief OMPException obj to store and rethrow exceptions from omp blocks*/ + dmlc::OMPException omp_exc_; + + std::size_t SanityCheck(const std::vector >& inputs) { + auto bs = inputs.size(); + CHECK_GT(bs, 0) << "BatchifyFunction should handle at lease 1 sample"; + auto out_size = inputs[0].size(); + // sanity check: each input has same size + for (size_t i = 1; i < bs; ++i) { + CHECK_EQ(inputs[i].size(), out_size) + << i << "-th input size does not match " << out_size; + } + return out_size; + } +}; // class StackBatchify + +MXNET_REGISTER_IO_BATCHIFY_FUNCTION(StackBatchify) + .describe(R"code(Returns the StackBatchify function. 
+ )code" ADD_FILELINE) + .add_arguments(StackBatchifyParam::__FIELDS__()) + .set_body([](const std::vector >& kwargs) { + return new StackBatchify(kwargs); +}); + +struct PadBatchifyParam : public dmlc::Parameter { + int use_shared_mem; + double pad_val; + int dtype; + int round_to; + // declare parameters + DMLC_DECLARE_PARAMETER(PadBatchifyParam) { + DMLC_DECLARE_FIELD(use_shared_mem).set_default(0) + .describe("If 1, use shared memory."); + DMLC_DECLARE_FIELD(pad_val).set_default(0) + .describe("The filled values, default to 0."); + DMLC_DECLARE_FIELD(dtype).set_default(-1) + .describe("If not -1, force to use dtype as output type, otherwise use input type."); + DMLC_DECLARE_FIELD(round_to).set_default(-1) + .describe("If > 0, the padded dimension will be rounded to be multiple of this value."); + } +}; // struct PadBatchifyParam + +DMLC_REGISTER_PARAMETER(PadBatchifyParam); + +class PadBatchify : public BatchifyFunction { + public: + explicit PadBatchify(const std::vector >& kwargs) { + param_.InitAllowUnknown(kwargs); + } + + virtual bool Batchify(const std::vector >& inputs, + std::vector* outputs) { + auto bs = inputs.size(); + CHECK_GT(bs, 0) << "BatchifyFunction should handle at lease 1 sample"; + auto out_size = inputs[0].size(); + outputs->resize(out_size); + for (size_t i = 0; i < out_size; ++i) { + // Process i-th output + mxnet::TShape ashape = inputs[0][i].shape(); + CHECK_GE(ashape.ndim(), 0) << "Data dim must be larger than 0"; + // find the maximum size in each dim + for (size_t j = 1; j < bs; ++j) { + mxnet::TShape other_shape = inputs[j][i].shape(); + CHECK_EQ(ashape.ndim(), other_shape.ndim()) + << "PadBatchify expects all inputs to have same dimensionality: given " + << ashape.ndim() << " vs. " << other_shape.ndim(); + for (dim_t k = 0; k < ashape.ndim(); ++k) { + ashape[k] = std::max(ashape[k], other_shape[k]); + } + } + for (dim_t k = 0; k < ashape.ndim(); ++k) { + // pad to multiple of round_to + if (param_.round_to > 0) { + ashape[k] = param_.round_to * static_cast( + std::ceil(static_cast(ashape[k] / param_.round_to))); + } + } + + // calculate output ndarray size + TShape sshape(ashape.ndim() + 1, 0); + sshape[0] = bs; + for (int k = 0; k < ashape.ndim(); ++k) { + sshape[k + 1] = ashape[k]; + } + + int dtype = param_.dtype > -1 ? 
+class PadBatchify : public BatchifyFunction {
+ public:
+  explicit PadBatchify(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    param_.InitAllowUnknown(kwargs);
+  }
+
+  virtual bool Batchify(const std::vector<std::vector<NDArray> >& inputs,
+                        std::vector<NDArray>* outputs) {
+    auto bs = inputs.size();
+    CHECK_GT(bs, 0) << "BatchifyFunction should handle at least 1 sample";
+    auto out_size = inputs[0].size();
+    outputs->resize(out_size);
+    for (size_t i = 0; i < out_size; ++i) {
+      // Process i-th output
+      mxnet::TShape ashape = inputs[0][i].shape();
+      CHECK_GE(ashape.ndim(), 0) << "Data dim must be larger than 0";
+      // find the maximum size in each dim
+      for (size_t j = 1; j < bs; ++j) {
+        mxnet::TShape other_shape = inputs[j][i].shape();
+        CHECK_EQ(ashape.ndim(), other_shape.ndim())
+          << "PadBatchify expects all inputs to have same dimensionality: given "
+          << ashape.ndim() << " vs. " << other_shape.ndim();
+        for (dim_t k = 0; k < ashape.ndim(); ++k) {
+          ashape[k] = std::max(ashape[k], other_shape[k]);
+        }
+      }
+      for (dim_t k = 0; k < ashape.ndim(); ++k) {
+        // pad to multiple of round_to
+        if (param_.round_to > 0) {
+          ashape[k] = param_.round_to * static_cast<int>(
+              std::ceil(static_cast<double>(ashape[k]) / param_.round_to));
+        }
+      }
+
+      // calculate output ndarray size
+      TShape sshape(ashape.ndim() + 1, 0);
+      sshape[0] = bs;
+      for (int k = 0; k < ashape.ndim(); ++k) {
+        sshape[k + 1] = ashape[k];
+      }
+
+      int dtype = param_.dtype > -1 ? param_.dtype : inputs[0][i].dtype();
+      if (!(*outputs)[i].is_none() &&
+          (*outputs)[i].ctx() == mxnet::Context::CPU(0) &&
+          (*outputs)[i].dtype() == dtype &&
+          (*outputs)[i].storage_type() == kDefaultStorage) {
+        if ((*outputs)[i].shape() != sshape) {
+          // realloc
+          (*outputs)[i].ReshapeAndAlloc(sshape);
+        }
+      } else {
+        (*outputs)[i] = NDArray(sshape, mxnet::Context::CPU(0), false, dtype);
+      }
+      MSHADOW_TYPE_SWITCH_WITH_BOOL(dtype, DType, {
+        // fill pad value first
+        std::fill((*outputs)[i].data().dptr<DType>(),
+                  (*outputs)[i].data().dptr<DType>() + sshape.Size(),
+                  static_cast<DType>(param_.pad_val));
+        DType *ptr = (*outputs)[i].data().dptr<DType>();
+        auto asize = ashape.Size();
+        _Pragma("omp parallel for num_threads(bs)")
+        for (size_t j = 0; j < bs; ++j) {
+          using namespace mshadow::expr;
+          auto compact_shapes = CompactShapes(ashape, inputs[j][i].shape());
+          // inputs[j][i].WaitToRead();
+          auto& fshape = compact_shapes.first;
+          auto& cshape = compact_shapes.second;
+          switch (fshape.size()) {
+            case 1U: {
+              mshadow::Tensor<cpu, 1, DType> dst = TBlob(
+                  ptr + asize * j, ashape, cpu::kDevMask, dtype, 0).get_with_shape<
+                      cpu, 1, DType>(mshadow::Shape1(fshape[0]));
+              mshadow::Tensor<cpu, 1, DType> src = inputs[j][i].data().get_with_shape<
+                  cpu, 1, DType>(mshadow::Shape1(cshape[0]));
+              slice<0>(dst, 0, cshape[0]) = src;
+              break;
+            }
+            case 2U: {
+              mshadow::Tensor<cpu, 2, DType> dst = TBlob(
+                  ptr + asize * j, ashape, cpu::kDevMask, dtype, 0).get_with_shape<
+                      cpu, 2, DType>(mshadow::Shape2(fshape[0], fshape[1]));
+              mshadow::Tensor<cpu, 2, DType> src = inputs[j][i].data().get_with_shape<
+                  cpu, 2, DType>(mshadow::Shape2(cshape[0], cshape[1]));
+              slice<1>(slice<0>(dst, 0, cshape[0]), 0, cshape[1]) = src;
+              break;
+            }
+            case 3U: {
+              mshadow::Tensor<cpu, 3, DType> dst = TBlob(
+                  ptr + asize * j, ashape, cpu::kDevMask, dtype, 0).get_with_shape<
+                      cpu, 3, DType>(mshadow::Shape3(fshape[0], fshape[1], fshape[2]));
+              mshadow::Tensor<cpu, 3, DType> src = inputs[j][i].data().get_with_shape<
+                  cpu, 3, DType>(mshadow::Shape3(cshape[0], cshape[1], cshape[2]));
+              slice<2>(slice<1>(slice<0>(dst, 0, cshape[0]), 0, cshape[1]), 0, cshape[2]) = src;
+              break;
+            }
+            case 4U: {
+              mshadow::Tensor<cpu, 4, DType> dst = TBlob(
+                  ptr + asize * j, ashape, cpu::kDevMask, dtype, 0).get_with_shape<
+                      cpu, 4, DType>(mshadow::Shape4(fshape[0], fshape[1], fshape[2], fshape[3]));
+              mshadow::Tensor<cpu, 4, DType> src = inputs[j][i].data().get_with_shape<
+                  cpu, 4, DType>(mshadow::Shape4(cshape[0], cshape[1], cshape[2], cshape[3]));
+              slice<3>(slice<2>(slice<1>(slice<0>(dst, 0, cshape[0]), 0, cshape[1]),
+                       0, cshape[2]), 0, cshape[3]) = src;
+              break;
+            }
+            case 5U: {
+              mshadow::Tensor<cpu, 5, DType> dst = TBlob(
+                  ptr + asize * j, ashape, cpu::kDevMask, dtype, 0).get_with_shape<
+                      cpu, 5, DType>(mshadow::Shape5(
+                          fshape[0], fshape[1], fshape[2], fshape[3], fshape[4]));
+              mshadow::Tensor<cpu, 5, DType> src = inputs[j][i].data().get_with_shape<
+                  cpu, 5, DType>(mshadow::Shape5(
+                      cshape[0], cshape[1], cshape[2], cshape[3], cshape[4]));
+              slice<4>(slice<3>(slice<2>(slice<1>(slice<0>(
+                  dst, 0, cshape[0]), 0, cshape[1]), 0, cshape[2]),
+                  0, cshape[3]), 0, cshape[4]) = src;
+              break;
+            }
+            default: {
+              LOG(FATAL) << "# dim to pad: " << cshape.size() << " exceeds limit of 5.";
+            }
+          }
+        }
+      })
+    }
+    return true;
+  }
+
+ private:
+  /*! \brief parameters */
+  PadBatchifyParam param_;
+  /*!
\brief OMPException obj to store and rethrow exceptions from omp blocks*/ + dmlc::OMPException omp_exc_; + + std::pair, std::vector> CompactShapes(const TShape& ashape, + const TShape& ishape) { + // squeeze dimensions that do not need pad + std::stack dim_stack; + std::vector full_shape; + std::vector data_shape; + for (dim_t k = 0; k < ishape.ndim(); ++k) { + if (ishape[k] == ashape[k]) { + dim_stack.push(ishape[k]); + } else { + dim_t ss = 1; + while (!dim_stack.empty()) { + ss *= dim_stack.top(); + dim_stack.pop(); + } + if (ss > 1) { + full_shape.emplace_back(ss); + data_shape.emplace_back(ss); + } + full_shape.emplace_back(ashape[k]); + data_shape.emplace_back(ishape[k]); + } + } + // clear the stack + index_t ss = 1; + while (!dim_stack.empty()) { + ss *= dim_stack.top(); + dim_stack.pop(); + } + if (ss > 1 || full_shape.empty()) { + full_shape.emplace_back(ss); + data_shape.emplace_back(ss); + } + CHECK_EQ(full_shape.size(), data_shape.size()); + CHECK_GE(data_shape.size(), 1U); + return std::make_pair(full_shape, data_shape); + } +}; // class PadBatchify + +MXNET_REGISTER_IO_BATCHIFY_FUNCTION(PadBatchify) + .describe(R"code(Returns the StackBatchify function. + )code" ADD_FILELINE) + .add_arguments(PadBatchifyParam::__FIELDS__()) + .set_body([](const std::vector >& kwargs) { + return new PadBatchify(kwargs); +}); +} // namespace io +} // namespace mxnet diff --git a/src/io/dataloader.cc b/src/io/dataloader.cc new file mode 100644 index 000000000000..5b47b8c01809 --- /dev/null +++ b/src/io/dataloader.cc @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file dataloader.cc + * \brief Pure c++ backed dataloader implementation + */ +#include +#include +#include + +#include "./inst_vector.h" +#include "./iter_prefetcher.h" +#include "../profiler/custom_op_profiler.h" + +namespace mxnet { +namespace io { +struct ThreadedDataLoaderParam : public dmlc::Parameter { + /*! \brief Multithread worker number. */ + int num_workers; + /*! \brief dataset pointer.*/ + std::intptr_t dataset; + /*! \brief sampler pointer.*/ + std::intptr_t sampler; + /*! \brief batchify function pointer.*/ + std::intptr_t batchify_fn; + /*! 
\brief pin memory to device id.*/
+  int pin_device_id;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(ThreadedDataLoaderParam) {
+    DMLC_DECLARE_FIELD(num_workers).set_default(0)
+      .describe("Number of thread workers.");
+    DMLC_DECLARE_FIELD(dataset)
+      .describe("Pointer to the c++ dataset.");
+    DMLC_DECLARE_FIELD(sampler)
+      .describe("Pointer to the sampler iterator.");
+    DMLC_DECLARE_FIELD(batchify_fn)
+      .describe("Pointer to the batchify function.");
+    DMLC_DECLARE_FIELD(pin_device_id).set_default(-1)
+      .describe("If not negative, will move data to pinned memory.");
+  }
+};  // struct ThreadedDataLoaderParam
+
+DMLC_REGISTER_PARAMETER(ThreadedDataLoaderParam);
+
+template<typename DType = real_t>
+class ThreadedDataLoader : public IIterator<TBlobBatch> {
+ public:
+  ThreadedDataLoader() {
+  }
+  // destructor
+  virtual ~ThreadedDataLoader(void) {
+  }
+  // constructor
+  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    param_.InitAllowUnknown(kwargs);
+    int maxthread, threadget;
+    #pragma omp parallel
+    {
+      // be conservative, set number of real cores
+      maxthread = std::max(omp_get_num_procs(), 1);
+    }
+    param_.num_workers = std::min(maxthread, param_.num_workers);
+    #pragma omp parallel num_threads(param_.num_workers)
+    {
+      threadget = omp_get_num_threads();
+    }
+    param_.num_workers = std::max(1, threadget);
+    auto dataset = *static_cast<std::shared_ptr<Dataset>*>(
+        reinterpret_cast<void*>(param_.dataset));
+    datasets_.clear();
+    datasets_.reserve(param_.num_workers);
+    datasets_.emplace_back(dataset);
+    for (int i = 1; i < param_.num_workers; ++i) {
+      datasets_.emplace_back(std::shared_ptr<Dataset>(dataset->Clone()));
+    }
+    dataset_len_ = datasets_[0]->GetLen();
+    sampler_ = static_cast<IIterator<DataBatch>* >(reinterpret_cast<void*>(param_.sampler));
+    batchify_fn_ = *static_cast<BatchifyFunctionPtr*>(
+        reinterpret_cast<void*>(param_.batchify_fn));
+    this->BeforeFirst();
+  }
+  // before first
+  void BeforeFirst(void) {
+    sampler_->BeforeFirst();
+  }
+
+  int64_t GetLenHint(void) const {
+    return sampler_->GetLenHint();
+  }
+
+  bool Next(void) {
+    bool has_next = sampler_->Next();
+    if (!has_next) return false;
+    auto samples = sampler_->Value();
+    auto batch_size = samples.data[0].shape().Size();
+    auto real_batch_size = batch_size - samples.num_batch_padd;
+    const int64_t *idx_ptr = static_cast<const int64_t*>(
+        samples.data[0].data().dptr_);
+    std::vector<int64_t> idx_ptrs;
+    idx_ptrs.assign(idx_ptr, idx_ptr + real_batch_size);
+
+    // __getitem__
+    std::vector<std::vector<NDArray> > inputs(batch_size);
+    std::vector<bool> is_scalars;
+    bool profiling = profiler::Profiler::Get()->IsProfiling(profiler::Profiler::kImperative);
+    if (profiling) {
+      profiler::CustomOpProfiler::Get()->OnCustomBegin("MXThreadedDataLoaderGetItems");
+    }
+    #pragma omp parallel for num_threads(param_.num_workers)
+    for (size_t i = 0; i < real_batch_size; ++i) {
+      omp_exc_.Run([&] {
+        auto idx = idx_ptrs[i];
+        CHECK(datasets_[i % param_.num_workers]->GetItem(idx, &inputs[i]))
+          << "Error getting data # " << idx;
+      });
+    }
+    if (profiling) {
+      profiler::CustomOpProfiler::Get()->OnCustomEnd();
+    }
+    omp_exc_.Rethrow();
+
+    // pad to normal batch size
+    for (size_t i = real_batch_size; i < batch_size; ++i) {
+      inputs[i] = inputs[0];
+    }
+
+    // batchify
+    if (profiling) {
+      profiler::CustomOpProfiler::Get()->OnCustomBegin("MXThreadedDataLoaderBatchify");
+    }
+    CHECK(batchify_fn_->Batchify(inputs, &batched_buffer_))
+      << "Error calling batchify inside dataloader";
+    if (profiling) {
+      profiler::CustomOpProfiler::Get()->OnCustomEnd();
+    }
+    out_.batch_size = batched_buffer_.size();
+    out_.data.resize(batched_buffer_.size());
+    for (size_t i = 0; i < batched_buffer_.size(); ++i) {
+      out_.data[i] = batched_buffer_[i].data();
+    }
+    out_.num_batch_padd = samples.num_batch_padd;
+    return true;
+  }
+
+  const TBlobBatch &Value(void) const {
+    return out_;
+  }
+
+ private:
+  /*! \brief Params */
+  ThreadedDataLoaderParam param_;
+  /*! \brief output */
+  TBlobBatch out_;
+  /*! \brief batched buffer */
+  std::vector<NDArray> batched_buffer_;
+  /*! \brief pointer to dataset */
+  // std::shared_ptr<Dataset> dataset_;
+  std::vector<std::shared_ptr<Dataset> > datasets_;
+  /*! \brief dataset length */
+  int64_t dataset_len_;
+  /*! \brief pointer to sampler iterator */
+  IIterator<DataBatch> *sampler_;
+  /*! \brief pointer to batchify function */
+  BatchifyFunctionPtr batchify_fn_;
+  /*! \brief OMPException obj to store and rethrow exceptions from omp blocks*/
+  dmlc::OMPException omp_exc_;
+};  // class ThreadedDataLoader
+
+MXNET_REGISTER_IO_ITER(ThreadedDataLoader)
+.describe(R"code(Returns a threaded data loader iterator.
+)code" ADD_FILELINE)
+.add_arguments(ThreadedDataLoaderParam::__FIELDS__())
+.add_arguments(PrefetcherParam::__FIELDS__())
+.set_body([]() {
+    return new PrefetcherIter(
+        new ThreadedDataLoader<real_t>());
+  });
+}  // namespace io
+}  // namespace mxnet
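`ThreadedDataLoader` is the C++ iterator behind the Gluon `DataLoader` fast path: when the dataset, sampler and batchify function are all C++-backed, item fetching and batching happen inside this iterator with OpenMP workers instead of Python multiprocessing. A usage sketch, assuming the `try_nopython` switch this patch threads through `mxnet.gluon.data.DataLoader` (treat the flag name as an assumption):

    from mxnet.gluon.data import DataLoader
    from mxnet.gluon.data.vision import MNIST

    dataset = MNIST(train=True)
    # try_nopython asks DataLoader to lower the whole pipeline into C++
    loader = DataLoader(dataset, batch_size=32, num_workers=2, try_nopython=True)
    for data, label in loader:
        print(data.shape, label.shape)  # (32, 28, 28, 1) (32,)
        break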
diff --git a/src/io/dataset.cc b/src/io/dataset.cc
new file mode 100644
index 000000000000..11cab3672a7c
--- /dev/null
+++ b/src/io/dataset.cc
@@ -0,0 +1,697 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/*!
+ * Copyright (c) 2020 by Contributors
+ * \file dataset.cc
+ * \brief High performance datasets implementation
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include "../imperative/cached_op.h"
+#include "../imperative/naive_cached_op.h"
+#include "../ndarray/ndarray_function.h"
+
+#if MXNET_USE_OPENCV
+#include
+#include "./opencv_compatibility.h"
+#endif  // MXNET_USE_OPENCV
+
+namespace mxnet {
+namespace io {
+
+struct RecordFileDatasetParam : public dmlc::Parameter<RecordFileDatasetParam> {
+  std::string rec_file;
+  std::string idx_file;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(RecordFileDatasetParam) {
+    DMLC_DECLARE_FIELD(rec_file)
+      .describe("The absolute path of record file.");
+    DMLC_DECLARE_FIELD(idx_file)
+      .describe("The path of the idx file.");
+  }
+};  // struct RecordFileDatasetParam
+
+DMLC_REGISTER_PARAMETER(RecordFileDatasetParam);
+
+class RecordFileDataset final : public Dataset {
+ public:
+  explicit RecordFileDataset(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left;
+    param_.InitAllowUnknown(kwargs);
+    // open record file for read
+    dmlc::Stream *stream = dmlc::Stream::Create(param_.rec_file.c_str(), "r");
+    reader_ = std::make_shared<dmlc::RecordIOReader>(stream);
+    stream_.reset(stream);
+    // read and process idx file
+    dmlc::Stream *idx_stream = dmlc::Stream::Create(param_.idx_file.c_str(), "r");
+    dmlc::istream is(idx_stream);
+    size_t key, idx;
+    while (is >> key >> idx) {
+      idx_[key] = idx;
+    }
+    delete idx_stream;
+  }
+
+  RecordFileDataset* Clone(void) const {
+    auto other = new RecordFileDataset(std::vector<std::pair<std::string, std::string> >());
+    other->param_ = param_;
+    other->idx_ = idx_;
+    // do not share the pointer since it's not threadsafe to seek simultaneously
+    if (reader_ && stream_) {
+      dmlc::Stream *stream = dmlc::Stream::Create(param_.rec_file.c_str(), "r");
+      other->reader_ = std::make_shared<dmlc::RecordIOReader>(stream);
+      other->stream_.reset(stream);
+    }
+    return other;
+  }
+
+  uint64_t GetLen() const {
+    return idx_.size();
+  }
+
+  bool GetItem(uint64_t idx, std::vector<NDArray>* ret) {
+    ret->resize(1);
+    auto& out = (*ret)[0];
+    size_t pos = idx_[static_cast<size_t>(idx)];
+    {
+      std::lock_guard<std::mutex> lck(mutex_);
+      reader_->Seek(pos);
+      if (reader_->NextRecord(&read_buff_)) {
+        const char *buf = read_buff_.c_str();
+        const size_t size = read_buff_.size();
+        out = NDArray(TShape({static_cast<dim_t>(size)}), Context::CPU(), false, mshadow::kInt8);
+        TBlob dst = out.data();
+        RunContext rctx{Context::CPU(), nullptr, nullptr, false};
+        mxnet::ndarray::Copy<cpu, cpu>(
+            TBlob(const_cast<void*>(reinterpret_cast<const void*>(buf)),
+                  out.shape(), cpu::kDevMask, out.dtype(), 0),
+            &dst, Context::CPU(), Context::CPU(), rctx);
+      }
+    }
+    return true;
+  }
+
+ private:
+  /*! \brief parameters */
+  RecordFileDatasetParam param_;
+  /*! \brief recordIO context */
+  std::shared_ptr<dmlc::RecordIOReader> reader_;
+  std::shared_ptr<dmlc::Stream> stream_;
+  std::string read_buff_;
+  std::mutex mutex_;
+  /*!
\brief indices */ + std::unordered_map idx_; +}; + +MXNET_REGISTER_IO_DATASET(RecordFileDataset) + .describe("MXNet Record File Dataset") + .add_arguments(RecordFileDatasetParam::__FIELDS__()) + .set_body([](const std::vector >& kwargs) { + return new RecordFileDataset(kwargs); +}); + +struct ImageRecordFileDatasetParam : public dmlc::Parameter { + std::string rec_file; + std::string idx_file; + int flag; + // declare parameters + DMLC_DECLARE_PARAMETER(ImageRecordFileDatasetParam) { + DMLC_DECLARE_FIELD(rec_file) + .describe("The absolute path of record file."); + DMLC_DECLARE_FIELD(idx_file) + .describe("The path of the idx file."); + DMLC_DECLARE_FIELD(flag).set_default(1) + .describe("If 1, always convert to colored, if 0 always convert to grayscale."); + } +}; // struct ImageRecordFileDatasetParam + +DMLC_REGISTER_PARAMETER(ImageRecordFileDatasetParam); + +#if MXNET_USE_OPENCV +template +void SwapImageChannels(const cv::Mat &img, NDArray* arr) { + int swap_indices[n_channels]; // NOLINT(*) + if (n_channels == 1) { + swap_indices[0] = 0; + } else if (n_channels == 3) { + swap_indices[0] = 2; + swap_indices[1] = 1; + swap_indices[2] = 0; + } else if (n_channels == 4) { + swap_indices[0] = 2; + swap_indices[1] = 1; + swap_indices[2] = 0; + swap_indices[3] = 3; + } + + TShape arr_shape = TShape({img.rows, img.cols, n_channels}); + if (arr->is_none() || arr->shape() != arr_shape || arr->ctx() != mxnet::Context::CPU(0) || + arr->dtype() != mshadow::kUint8 || arr->storage_type() != kDefaultStorage) { + *arr = NDArray(arr_shape, mxnet::Context::CPU(0), false, mshadow::kUint8); + } + auto ptr = static_cast(arr->data().dptr_); + + // swap channels while copying elements into buffer + for (int i = 0; i < img.rows; ++i) { + const uint8_t* im_data = img.ptr(i); + uint8_t* buffer_data = ptr + i * img.cols * n_channels; + for (int j = 0; j < img.cols; ++j) { + for (int k = 0; k < n_channels; ++k) { + buffer_data[k] = im_data[swap_indices[k]]; + } + im_data += n_channels; + buffer_data += n_channels; + } + } +} +#endif + +/*! \brief Struct for unpack recordio header */ +#pragma pack(1) +struct IRHeader { + uint32_t flag; + float label; + uint64_t id; + uint64_t id2; +}; // struct IRHeader + +class ImageRecordFileDataset : public Dataset { + public: + explicit ImageRecordFileDataset(const std::vector >& kwargs) { + std::vector > kwargs_left; + param_.InitAllowUnknown(kwargs); + base_ = std::make_shared(kwargs); + } + + ImageRecordFileDataset* Clone(void) const { + auto other = new ImageRecordFileDataset(std::vector >()); + other->param_ = param_; + other->base_.reset(base_->Clone()); + return other; + } + + uint64_t GetLen() const { + return base_->GetLen(); + } + + bool GetItem(uint64_t idx, std::vector* ret) { + CHECK_LT(idx, GetLen()); + std::vector raw; + if (!base_->GetItem(idx, &raw)) return false; + CHECK_EQ(raw.size(), 1U) << "RecordFileDataset should return size 1 NDArray vector"; + uint8_t *s = reinterpret_cast(raw[0].data().dptr_); + size_t size = raw[0].shape().Size(); + CHECK_GT(size, sizeof(IRHeader)) << "Invalid size of bytes from Record File"; + IRHeader header; + std::memcpy(&header, s, sizeof(header)); + size -= sizeof(header); + s += sizeof(header); + NDArray label = NDArray(Context::CPU(), mshadow::default_type_flag); + RunContext rctx{Context::CPU(), nullptr, nullptr, false}; + if (header.flag > 0) { + auto label_shape = header.flag <= 1 ? 
TShape(0, 1) : TShape({header.flag});
+      label.ReshapeAndAlloc(label_shape);
+      TBlob dst = label.data();
+      mxnet::ndarray::Copy<cpu, cpu>(
+          TBlob(reinterpret_cast<void*>(s), label.shape(), cpu::kDevMask, label.dtype(), 0),
+          &dst, Context::CPU(), Context::CPU(), rctx);
+      s += sizeof(float) * header.flag;
+      size -= sizeof(float) * header.flag;
+    } else {
+      // label is a scalar with ndim() == 0
+      label.ReshapeAndAlloc(TShape(0, 1));
+      TBlob dst = label.data();
+      *(dst.dptr<float>()) = header.label;
+    }
+    ret->resize(2);
+    (*ret)[1] = label;
+#if MXNET_USE_OPENCV
+    cv::Mat buf(1, size, CV_8U, s);
+    cv::Mat res = cv::imdecode(buf, param_.flag);
+    CHECK(!res.empty()) << "Decoding failed. Invalid image file.";
+    const int n_channels = res.channels();
+    if (n_channels == 1) {
+      SwapImageChannels<1>(res, &(ret->at(0)));
+    } else if (n_channels == 3) {
+      SwapImageChannels<3>(res, &(ret->at(0)));
+    } else if (n_channels == 4) {
+      SwapImageChannels<4>(res, &(ret->at(0)));
+    }
+    return true;
+#else
+    LOG(FATAL) << "Opencv is needed for image decoding.";
+#endif
+    return false;  // should not reach here
+  }
+
+ private:
+  /*! \brief parameters */
+  ImageRecordFileDatasetParam param_;
+  /*! \brief base recordIO reader */
+  std::shared_ptr<RecordFileDataset> base_;
+};
+
+MXNET_REGISTER_IO_DATASET(ImageRecordFileDataset)
+ .describe("MXNet Image Record File Dataset")
+ .add_arguments(ImageRecordFileDatasetParam::__FIELDS__())
+ .set_body([](const std::vector<std::pair<std::string, std::string> >& kwargs) {
+   return new ImageRecordFileDataset(kwargs);
+});
+
+struct ImageSequenceDatasetParam : public dmlc::Parameter<ImageSequenceDatasetParam> {
+  /*! \brief the list of absolute image paths, joined by the path separator */
+  std::string img_list;
+  /*! \brief the path separator character, by default it's '|' */
+  char path_sep;
+  /*! \brief If flag is 0, always convert to grayscale(1 channel).
+   * If flag is 1, always convert to colored (3 channels).
+   * If flag is -1, keep channels unchanged.
+   */
+  int flag;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(ImageSequenceDatasetParam) {
+    DMLC_DECLARE_FIELD(img_list)
+      .describe("The list of image absolute paths.");
+    DMLC_DECLARE_FIELD(path_sep).set_default('|')
+      .describe("The path separator for joined image paths.");
+    DMLC_DECLARE_FIELD(flag).set_default(1)
+      .describe("If 1, always convert to colored, if 0 always convert to grayscale.");
+  }
+};  // struct ImageSequenceDatasetParam
+
+DMLC_REGISTER_PARAMETER(ImageSequenceDatasetParam);
+
+class ImageSequenceDataset final : public Dataset {
+ public:
+  explicit ImageSequenceDataset(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left;
+    param_.InitAllowUnknown(kwargs);
+    img_list_ = dmlc::Split(param_.img_list, param_.path_sep);
+  }
+
+  ImageSequenceDataset* Clone(void) const {
+    return new ImageSequenceDataset(*this);
+  }
+
+  uint64_t GetLen() const {
+    return img_list_.size();
+  }
+
+  bool GetItem(uint64_t idx, std::vector<NDArray>* ret) {
+#if MXNET_USE_OPENCV
+    CHECK_LT(idx, img_list_.size())
+      << "GetItem index: " << idx << " out of bound: " << img_list_.size();
+    cv::Mat res = cv::imread(img_list_[idx], param_.flag);
+    CHECK(!res.empty()) << "Decoding failed. Invalid image file.";
+    const int n_channels = res.channels();
+    ret->resize(1);
+    if (n_channels == 1) {
+      SwapImageChannels<1>(res, &(ret->at(0)));
+    } else if (n_channels == 3) {
+      SwapImageChannels<3>(res, &(ret->at(0)));
+    } else if (n_channels == 4) {
+      SwapImageChannels<4>(res, &(ret->at(0)));
+    }
+    return true;
+#else
+    LOG(FATAL) << "Opencv is needed for image decoding.";
+#endif
+    return false;
+  }
+
+ private:
+  /*!
\brief parameters */ + ImageSequenceDatasetParam param_; + /*! \brief image list */ + std::vector img_list_; +}; + +MXNET_REGISTER_IO_DATASET(ImageSequenceDataset) + .describe("Image Sequence Dataset") + .add_arguments(ImageSequenceDatasetParam::__FIELDS__()) + .set_body([](const std::vector >& kwargs) { + return new ImageSequenceDataset(kwargs); +}); + +struct NDArrayDatasetParam : public dmlc::Parameter { + /*! \brief the source ndarray */ + std::intptr_t arr; + // declare parameters + DMLC_DECLARE_PARAMETER(NDArrayDatasetParam) { + DMLC_DECLARE_FIELD(arr) + .describe("Pointer to NDArray."); + } +}; // struct NDArrayDatasetParam + +DMLC_REGISTER_PARAMETER(NDArrayDatasetParam); + +class NDArrayDataset final : public Dataset { + public: + explicit NDArrayDataset(const std::vector >& kwargs) { + param_.InitAllowUnknown(kwargs); + data_ = *(static_cast(reinterpret_cast(param_.arr))); + if (data_.shape().ndim() < 1) { + LOG(FATAL) << "NDArray with no dim is not iterable"; + } + size_ = data_.shape().begin()[0]; + } + + NDArrayDataset* Clone(void) const { + return new NDArrayDataset(*this); + } + + uint64_t GetLen() const { + return size_; + } + + bool GetItem(uint64_t idx, std::vector* rets) { + CHECK_LT(idx, size_) + << "GetItem index: " << idx << " out of bound: " << size_; + rets->resize(1); + auto& ret = (*rets)[0]; + ret = data_.Slice(idx, idx + 1); + if (ret.shape().ndim() > 1) { + // remove first dim to be consistent with numpy + TShape new_shape; + new_shape.assign(ret.shape().begin() + 1, ret.shape().end()); + ret = ret.Reshape(new_shape); + } else { + if (data_.shape().ndim() == 1) { + // scalar + TShape new_shape(0, 1); + ret = ret.Reshape(new_shape); + } + } + return true; + } + + private: + /*! \brief parameters */ + NDArrayDatasetParam param_; + /*! \brief stored ndarray */ + NDArray data_; + /*! \brief stored ndarray shape */ + int64_t size_; +}; // class NDArrayDataset + +MXNET_REGISTER_IO_DATASET(NDArrayDataset) + .describe("Single NDArray Dataset") + .add_arguments(NDArrayDatasetParam::__FIELDS__()) + .set_body([](const std::vector >& kwargs) { + return new NDArrayDataset(kwargs); +}); + +struct GroupDatasetParam : public dmlc::Parameter { + /*! 
\brief the source ndarray */ + Tuple datasets; + // declare parameters + DMLC_DECLARE_PARAMETER(GroupDatasetParam) { + DMLC_DECLARE_FIELD(datasets) + .describe("A small set of pointers to other c++ datasets."); + } +}; // struct GroupDatasetParam + +DMLC_REGISTER_PARAMETER(GroupDatasetParam); + +class GroupDataset final : public Dataset { + public: + explicit GroupDataset(const std::vector >& kwargs) { + std::vector > kwargs_left; + param_.InitAllowUnknown(kwargs); + auto childs = param_.datasets; + childs_.reserve(childs.ndim()); + size_t child_cnt = 0; + for (auto child : childs) { + auto d = *static_cast*>(reinterpret_cast(child)); + if (child_cnt == 0) { + size_ = d->GetLen(); + } else { + CHECK_EQ(size_, d->GetLen()) + << "All child dataset of GroupDataset must be identical " + << "Given mismatch: " << size_ << " vs " << d->GetLen(); + } + childs_.emplace_back(d); + child_cnt++; + } + } + + GroupDataset* Clone(void) const { + return new GroupDataset(*this); + } + + uint64_t GetLen() const { + return size_; + } + + bool GetItem(uint64_t idx, std::vector* ret) { + CHECK_LT(idx, size_) + << "GetItem index: " << idx << " out of bound: " << size_; + ret->clear(); + for (auto child : childs_) { + std::vector temp_ret; + if (!child->GetItem(idx, &temp_ret)) return false; + ret->insert(ret->end(), temp_ret.begin(), temp_ret.end()); + } + return true; + } + + private: + /*! \brief parameters */ + GroupDatasetParam param_; + /*! \brief stored child datasets */ + std::vector> childs_; + /*! \brief overall dataset size, equals to all child datasets */ + uint64_t size_; +}; // class GroupDataset + +MXNET_REGISTER_IO_DATASET(GroupDataset) + .describe("Grouped Dataset that combine a bunch of datasets") + .add_arguments(GroupDatasetParam::__FIELDS__()) + .set_body([](const std::vector >& kwargs) { + return new GroupDataset(kwargs); +}); + +struct IndexedDatasetParam : public dmlc::Parameter { + /*! \brief the base dataset */ + std::intptr_t base; + /*! \brief the indices */ + Tuple indices; + // declare parameters + DMLC_DECLARE_PARAMETER(IndexedDatasetParam) { + DMLC_DECLARE_FIELD(base) + .describe("Pointer to the internal c++ dataset that is going to be indexed."); + DMLC_DECLARE_FIELD(indices) + .describe("The indices for the internal dataset. Output[i] will be base[indices[i]]."); + } +}; // struct IndexedDatasetParam + +DMLC_REGISTER_PARAMETER(IndexedDatasetParam); + +class IndexedDataset final : public Dataset { + public: + explicit IndexedDataset(const std::vector >& kwargs) { + param_.InitAllowUnknown(kwargs); + base_data_ = *static_cast*>(reinterpret_cast(param_.base)); + } + + IndexedDataset* Clone(void) const { + return new IndexedDataset(*this); + } + + uint64_t GetLen() const { + return param_.indices.ndim(); + } + + bool GetItem(uint64_t idx, std::vector* ret) { + CHECK_GT(param_.indices.ndim(), idx) << "IndexError: " << idx + << " from total: " << param_.indices.ndim(); + auto new_idx = param_.indices[idx]; + CHECK_GT(base_data_->GetLen(), new_idx) << "IndexError: " << new_idx + << " from original dataset with size: " << base_data_->GetLen(); + return base_data_->GetItem(new_idx, ret); + } + + private: + /*! \brief parameters */ + IndexedDatasetParam param_; + /*! 
\brief stored child dataset */
+  std::shared_ptr<Dataset> base_data_;
+};  // class IndexedDataset
+
+MXNET_REGISTER_IO_DATASET(IndexedDataset)
+ .describe("Dataset that selects items from a base dataset with the given indices")
+ .add_arguments(IndexedDatasetParam::__FIELDS__())
+ .set_body([](const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    return new IndexedDataset(kwargs);
+});
+
+struct LazyTransformDatasetParam : public dmlc::Parameter<LazyTransformDatasetParam> {
+  /*! \brief pointer to the cached transform function */
+  std::intptr_t cached_op;
+  /*! \brief internal dataset */
+  std::intptr_t dataset;
+  /*! \brief indices for items that need transformation */
+  Tuple<int> transform_indices;
+  /*! \brief is_scalar information for outputs */
+  Tuple<int> scalar_outputs;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(LazyTransformDatasetParam) {
+    DMLC_DECLARE_FIELD(cached_op)
+      .describe("Pointer to cached transform function.");
+    DMLC_DECLARE_FIELD(dataset)
+      .describe("Pointer to internal dataset.");
+    DMLC_DECLARE_FIELD(transform_indices).set_default(Tuple<int>({}))
+      .describe("The indices for dataset items that need to be transformed/processed. "
+                "If `transform_indices` is empty (default), "
+                "then all items will be processed.");
+    DMLC_DECLARE_FIELD(scalar_outputs)
+      .describe("Indicates whether outputs are scalars; the size must match the output size.");
+  }
+};  // struct LazyTransformDatasetParam
+
+DMLC_REGISTER_PARAMETER(LazyTransformDatasetParam);
+
+class LazyTransformDataset final : public Dataset {
+ public:
+  LazyTransformDataset(const LazyTransformDataset& other) {
+    this->param_ = other.param_;
+    this->pass_through_indices_ = other.pass_through_indices_;
+    this->use_input_indices_ = other.use_input_indices_;
+    this->num_outputs_ = other.num_outputs_;
+    this->cached_op_ = NaiveCachedOpPtr(new NaiveCachedOp(
+        other.cached_op_->sym_, other.cached_op_->flags_));
+    this->base_data_ = other.base_data_;
+  }
+
+  explicit LazyTransformDataset(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    param_.InitAllowUnknown(kwargs);
+    auto op = *static_cast<CachedOpPtr*>(reinterpret_cast<void*>(param_.cached_op));
+    cached_op_ = NaiveCachedOpPtr(new NaiveCachedOp(op->sym_, op->flags_));
+    base_data_ = *static_cast<std::shared_ptr<Dataset>*>(reinterpret_cast<void*>(param_.dataset));
+
+    // use the first item to calculate size info
+    CHECK_GT(GetLen(), 0)
+      << "LazyTransformDataset expects the base dataset to have at least 1 item";
+    std::vector<NDArray> inputs;
+    CHECK(base_data_->GetItem(0, &inputs));
+    // check output size
+    CHECK_EQ(param_.scalar_outputs.ndim(), cached_op_->num_outputs())
+      << "Output scalar info size: " << param_.scalar_outputs.ndim() << " vs. output size: "
+      << cached_op_->num_outputs() << " mismatch!";
+    // check input size
+    if (param_.transform_indices.ndim() == 0) {
+      std::vector<size_t> default_indices;
+      default_indices.reserve(cached_op_->num_inputs());
+      for (size_t i = 0; i < cached_op_->num_inputs(); ++i) {
+        default_indices.emplace_back(i);
+      }
+      use_input_indices_ = default_indices;
+    } else {
+      use_input_indices_ = std::vector<size_t>(param_.transform_indices.begin(),
+                                               param_.transform_indices.end());
+    }
+    CHECK_EQ(use_input_indices_.size(), cached_op_->num_inputs())
+      << "Mismatched transform indices and transform inputs: " << use_input_indices_.size()
+      << " vs. " << cached_op_->num_inputs();
+    auto num_inputs = use_input_indices_.size();
+    CHECK_GE(inputs.size(), num_inputs)
+      << "LazyTransformDataset input size " << inputs.size()
+      << " smaller than transform input size: " << num_inputs;
+    pass_through_indices_.clear();
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      // filling output ndarray from unaltered inputs, transformed outputs are already inserted
+      if (std::find(use_input_indices_.begin(),
+                    use_input_indices_.end(), i) == use_input_indices_.end()) {
+        pass_through_indices_.emplace_back(i);
+      }
+    }
+    num_outputs_ = inputs.size() + cached_op_->num_outputs() - cached_op_->num_inputs();
+  }
+
+  virtual ~LazyTransformDataset(void) {
+  }
+
+  LazyTransformDataset* Clone(void) const {
+    return new LazyTransformDataset(*this);
+  }
+
+  uint64_t GetLen() const {
+    return base_data_->GetLen();
+  }
+
+  bool GetItem(uint64_t idx, std::vector<NDArray>* outputs) {
+    std::vector<NDArray> inputs;
+    if (!base_data_->GetItem(idx, &inputs)) return false;
+    outputs->reserve(num_outputs_);
+    outputs->resize(cached_op_->num_outputs());
+    for (auto i : pass_through_indices_) {
+      outputs->emplace_back(inputs[i]);
+    }
+    CHECK_EQ(outputs->size(), num_outputs_);
+    // workspace for cached op
+    std::vector<NDArray*> ndinputs;
+    std::vector<NDArray*> ndoutputs;
+    ndinputs.reserve(inputs.size());
+    for (size_t i = 0; i < use_input_indices_.size(); ++i) {
+      ndinputs.emplace_back(&(inputs[use_input_indices_[i]]));
+    }
+    ndoutputs.reserve(cached_op_->num_outputs());
+    CHECK_LE(cached_op_->num_outputs(), outputs->size());
+    for (size_t i = 0; i < cached_op_->num_outputs(); ++i) {
+      ndoutputs.emplace_back(&(outputs->at(i)));
+    }
+
+    for (size_t i = 0; i < inputs.size(); ++i) {
+      inputs[i].WaitToRead();
+    }
+    cached_op_->Forward(cached_op_, ndinputs, ndoutputs);
+    return true;
+  }
+
+ private:
+  /*! \brief parameters */
+  LazyTransformDatasetParam param_;
+  /*! \brief stored cached op */
+  NaiveCachedOpPtr cached_op_;
+  /*! \brief internal dataset */
+  std::shared_ptr<Dataset> base_data_;
+  std::vector<size_t> use_input_indices_;
+  std::vector<size_t> pass_through_indices_;
+  size_t num_outputs_;
+};  // class LazyTransformDataset
+
+MXNET_REGISTER_IO_DATASET(LazyTransformDataset)
+ .describe("Dataset that applies a lazy transformation to an internal dataset")
+ .add_arguments(LazyTransformDatasetParam::__FIELDS__())
+ .set_body([](const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    return new LazyTransformDataset(kwargs);
+});
+}  // namespace io
+}  // namespace mxnet
diff --git a/src/io/image_iter_common.h b/src/io/image_iter_common.h
index 5e5bbe05d308..edda3d9dfe97 100644
--- a/src/io/image_iter_common.h
+++ b/src/io/image_iter_common.h
@@ -194,6 +194,41 @@ struct BatchParam : public dmlc::Parameter<BatchParam> {
   }
 };
 
+// Batch Sampler parameters
+struct BatchSamplerParam : public dmlc::Parameter<BatchSamplerParam> {
+  /*! \brief Last batch behavior type */
+  enum LastBatchType {
+    /*! \brief Keep the last batch even if it is not fully filled */
+    kKeep = 0,
+    /*! \brief Roll the remaining instances over to the next epoch */
+    kRollOver,
+    /*! \brief Discard the last batch if it is not fully filled */
+    kDiscard
+  };  // enum LastBatchType
+  /*! \brief batch size */
+  uint32_t batch_size;
+  /*! \brief last batch behavior */
+  int last_batch;
+  // declare parameters
+  DMLC_DECLARE_PARAMETER(BatchSamplerParam) {
+    DMLC_DECLARE_FIELD(batch_size)
+        .describe("Batch size.");
+    DMLC_DECLARE_FIELD(last_batch).set_default(kKeep)
+        .add_enum("keep", kKeep)
+        .add_enum("rollover", kRollOver)
+        .add_enum("discard", kDiscard)
+        .describe("Specifies how the last batch is handled if batch_size does not evenly "
+                  "divide the sequence length. "
+                  "If 'keep', the last batch will be returned directly, but will contain "
+                  "fewer elements than `batch_size` requires. "
+                  "If 'discard', the last batch will be discarded. "
+                  "If 'rollover', the remaining elements will be rolled over to the next "
+                  "iteration. Note: the legacy batch param with round_batch always rounds the "
+                  "data so that full batches are guaranteed. Rollover behavior will instead "
+                  "result in different iteration sizes for each epoch.");
+  }
+};
+
 // Define image record parameters
 struct ImageRecordParam: public dmlc::Parameter<ImageRecordParam> {
   /*! \brief whether to do shuffle */
@@ -346,13 +381,13 @@ struct ImageDetNormalizeParam : public dmlc::Parameter<ImageDetNormalizeParam>
 // Define prefetcher parameters
 struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
-  enum CtxType { kGPU = 0, kCPU};
+  enum CtxType { kGPU = 0, kCPU, kCPUPinned, kCPUShared};
   /*! \brief number of prefetched batches */
   size_t prefetch_buffer;
   /*! \brief Context data loader optimized for */
   int ctx;
-
+  int device_id;
   /*! \brief data type */
   dmlc::optional<int> dtype;
 
@@ -363,7 +398,14 @@ struct PrefetcherParam : public dmlc::Parameter<PrefetcherParam> {
     DMLC_DECLARE_FIELD(ctx).set_default(kGPU)
     .add_enum("cpu", kCPU)
     .add_enum("gpu", kGPU)
-    .describe("Context data loader optimized for.");
+    .add_enum("cpu_pinned", kCPUPinned)
+    .describe("Context data loader optimized for. "
+              "Note that this only indicates the optimization strategy for devices; "
+              "the prefetcher itself never loads data to GPUs. "
+              "If ctx is 'cpu_pinned' and device_id is not -1, "
+              "cpu_pinned(device_id) will be used as the context.");
+    DMLC_DECLARE_FIELD(device_id).set_default(-1)
+    .describe("The default device id for the context. -1 indicates the default device.");
     DMLC_DECLARE_FIELD(dtype)
     .add_enum("float32", mshadow::kFloat32)
     .add_enum("float64", mshadow::kFloat64)
diff --git a/src/io/io.cc b/src/io/io.cc
index b92f02e160fc..33131fd291d0 100644
--- a/src/io/io.cc
+++ b/src/io/io.cc
@@ -27,12 +27,15 @@
 // Registers
 namespace dmlc {
 DMLC_REGISTRY_ENABLE(::mxnet::DataIteratorReg);
+DMLC_REGISTRY_ENABLE(::mxnet::DatasetReg);
+DMLC_REGISTRY_ENABLE(::mxnet::BatchifyFunctionReg);
 }  // namespace dmlc
 
 namespace mxnet {
 namespace io {
 // Register parameters in header files
 DMLC_REGISTER_PARAMETER(BatchParam);
+DMLC_REGISTER_PARAMETER(BatchSamplerParam);
 DMLC_REGISTER_PARAMETER(PrefetcherParam);
 DMLC_REGISTER_PARAMETER(ImageNormalizeParam);
 DMLC_REGISTER_PARAMETER(ImageRecParserParam);
diff --git a/src/io/iter_batchloader.h b/src/io/iter_batchloader.h
index 279690b594e6..7532033fefdd 100644
--- a/src/io/iter_batchloader.h
+++ b/src/io/iter_batchloader.h
@@ -171,6 +171,147 @@ class BatchLoader : public IIterator<TBlobBatch> {
     }
   }
 };  // class BatchLoader
+
+/*! \brief create a batch sampler from a single-instance iterator.
+ * Unlike BatchLoader, BatchSampler handles flexible lengths during iteration.
+ */
+class BatchSampler : public IIterator<DataBatch> {
+ public:
+  explicit BatchSampler(IIterator<DataInst> *base):
+      num_overflow_(0), base_(base) {
+  }
+
+  virtual ~BatchSampler(void) {
+    delete base_;
+  }
+
+  inline void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) {
+    std::vector<std::pair<std::string, std::string> > kwargs_left;
+    // init the batch param; unrecognized kwargs are passed on to the base iterator
+    kwargs_left = param_.InitAllowUnknown(kwargs);
+    // Init space for out
+    out_.data.clear();
+    // init base iterator
+    base_->Init(kwargs);
+  }
+
+  virtual void BeforeFirst(void) {
+    if (param_.last_batch != param_.kRollOver || num_overflow_ == 0) {
+      // otherwise BeforeFirst has already been called while rolling over
+      base_->BeforeFirst();
+    }
+  }
+
+  virtual int64_t GetLenHint(void) const {
+    auto base_hint = base_->GetLenHint();
+    if (base_hint < 0) {
+      return base_hint;
+    } else if (param_.kKeep == param_.last_batch) {
+      return (base_hint + param_.batch_size - 1) / param_.batch_size;
+    } else if (param_.kDiscard == param_.last_batch) {
+      return base_hint / param_.batch_size;
+    } else if (param_.kRollOver == param_.last_batch) {
+      return (base_hint + num_overflow_) / param_.batch_size;
+    } else {
+      LOG(FATAL) << "last_batch must be one of 'keep', 'discard', or 'rollover'"
+                 << " but got: " << param_.last_batch;
+    }
+    return -1;
+  }
+
+  virtual bool Next(void) {
+    out_.num_batch_padd = 0;
+
+    size_t top = num_overflow_;  // start with the last overflow index
+
+    while (base_->Next()) {
+      const DataInst& d = base_->Value();
+      // out_.inst_index[top] = d.index;
+      if (data_.size() == 0) {
+        this->InitData(d);
+      }
+      for (size_t i = 0; i < d.data.size(); ++i) {
+        CHECK_EQ(unit_size_[i], d.data[i].Size());
+        MSHADOW_TYPE_SWITCH(data_[i].type_flag_, DType, {
+          mshadow::Copy(
+            data_[i].get<cpu, 1, DType>().Slice(top * unit_size_[i],
+                                                (top + 1) * unit_size_[i]),
+            d.data[i].get_with_shape<cpu, 1, DType>(mshadow::Shape1(unit_size_[i])));
+        });
+      }
+      if (++top >= param_.batch_size) {
+        num_overflow_ = 0;
+        return true;
+      }
+    }
+    if (top != 0) {
+      if (param_.last_batch == param_.kDiscard) {
+        // discard the batch
+        num_overflow_ = 0;
+        return false;
+      } else if (param_.last_batch == param_.kKeep) {
+        out_.num_batch_padd = param_.batch_size - top;
+        num_overflow_ = 0;
+        return true;
+      } else if (param_.last_batch == param_.kRollOver) {
+        if (num_overflow_ > 0) {
+          base_->BeforeFirst();
+          num_overflow_ = top;
+          return this->Next();
+        } else {
+          num_overflow_ = top;
+          return false;
+        }
+      } else {
+        LOG(FATAL) << "Unknown last_batch type: " << param_.last_batch;
+      }
+    }
+    return false;
+  }
+  virtual const DataBatch &Value(void) const {
+    return out_;
+  }
+
+ protected:
+  /*! \brief batch parameters */
+  BatchSamplerParam param_;
+  /*! \brief output data */
+  DataBatch out_;
+  /*! \brief number of overflow instances that were read in rollover mode */
+  int num_overflow_;
+  /*! \brief tensors to hold the batch data */
+  std::vector<TBlobContainer> data_;
+
+ private:
+  /*! \brief base iterator */
+  IIterator<DataInst> *base_;
+  /*! \brief data shape */
+  mxnet::ShapeVector shape_;
+  /*! \brief unit size */
+  std::vector<size_t> unit_size_;
+  // initialize the data holder, using shape and type info from the first instance.
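+  // Each data_[i] is one flat CPU buffer of batch_size * unit_size_[i] elements;
+  // Next() copies each instance into the slice [top * unit_size_[i], (top + 1) * unit_size_[i]).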
+ inline void InitData(const DataInst& first_batch) { + shape_.resize(first_batch.data.size()); + data_.resize(first_batch.data.size()); + unit_size_.resize(first_batch.data.size()); + for (size_t i = 0; i < first_batch.data.size(); ++i) { + mxnet::TShape src_shape = first_batch.data[i].shape_; + int src_type_flag = first_batch.data[i].type_flag_; + // init object attributes + std::vector shape_vec; + shape_vec.push_back(param_.batch_size); + for (index_t dim = 0; dim < src_shape.ndim(); ++dim) { + shape_vec.push_back(src_shape[dim]); + } + mxnet::TShape dst_shape(shape_vec.begin(), shape_vec.end()); + shape_[i] = dst_shape; + data_[i].resize(mshadow::Shape1(dst_shape.Size()), src_type_flag); + unit_size_[i] = src_shape.Size(); + out_.data.push_back(NDArray(TBlob( + data_[i].dptr_, dst_shape, cpu::kDevMask, src_type_flag, 0), 0)); + } + } +}; // class BatchSampler } // namespace io } // namespace mxnet #endif // MXNET_IO_ITER_BATCHLOADER_H_ diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h index fdd1d2b91925..dae11d36e252 100644 --- a/src/io/iter_prefetcher.h +++ b/src/io/iter_prefetcher.h @@ -47,7 +47,7 @@ namespace io { class PrefetcherIter : public IIterator { public: explicit PrefetcherIter(IIterator* base) - : loader_(base), out_(nullptr) {} + : loader_(base), out_(nullptr), length_hint_(-1) {} ~PrefetcherIter() { while (recycle_queue_.size() != 0) { @@ -63,6 +63,7 @@ class PrefetcherIter : public IIterator { std::vector > kwargs_left; // init image rec param kwargs_left = param_.InitAllowUnknown(kwargs); + CHECK_GT(param_.prefetch_buffer, 0) << "Prefetch_buffer must be positive number"; // maximum prefetch threaded iter internal size const int kMaxPrefetchBuffer = 16; // init thread iter @@ -73,6 +74,7 @@ class PrefetcherIter : public IIterator { InitParams(kwargs); // use the kwarg to init batch loader loader_->Init(kwargs); + length_hint_ = loader_->GetLenHint(); iter.Init([this](DataBatch **dptr) { if (!loader_->Next()) return false; const TBlobBatch& batch = loader_->Value(); @@ -86,14 +88,20 @@ class PrefetcherIter : public IIterator { auto dtype = param_.dtype ? param_.dtype.value() : batch.data[i].type_flag_; + auto ctx = ((param_.ctx == PrefetcherParam::kCPUPinned) && (param_.device_id >= 0)) ? + Context::CPUPinned(param_.device_id) : Context::CPU(); (*dptr)->data.at(i) = NDArray(batch.data[i].shape_, - Context::CPU(), false, + ctx, false, dtype); } } CHECK(batch.data.size() == (*dptr)->data.size()); // copy data over for (size_t i = 0; i < batch.data.size(); ++i) { + if ((*dptr)->data.at(i).shape() != batch.data[i].shape_) { + // perf warning, dynamic buffer might be slow + (*dptr)->data.at(i).ReshapeAndAlloc(batch.data[i].shape_); + } CHECK_EQ((*dptr)->data.at(i).shape(), batch.data[i].shape_); MSHADOW_TYPE_SWITCH(batch.data[i].type_flag_, DType, { mshadow::Copy(((*dptr)->data)[i].data().FlatTo2D(), @@ -108,13 +116,17 @@ class PrefetcherIter : public IIterator { } return true; }, - [this]() { loader_->BeforeFirst(); }); + [this]() { loader_->BeforeFirst(); length_hint_ = loader_->GetLenHint();}); } virtual void BeforeFirst(void) { iter.BeforeFirst(); } + virtual int64_t GetLenHint(void) const { + return length_hint_; + } + virtual bool Next(void) { if (out_ != nullptr) { recycle_queue_.push(out_); out_ = nullptr; @@ -148,6 +160,8 @@ class PrefetcherIter : public IIterator { DataBatch *out_; /*! \brief queue to be recycled */ std::queue recycle_queue_; + /*! 
\brief size hint cache */ + int64_t length_hint_; }; } // namespace io } // namespace mxnet diff --git a/src/io/iter_sampler.cc b/src/io/iter_sampler.cc new file mode 100644 index 000000000000..daf3f1c8fec4 --- /dev/null +++ b/src/io/iter_sampler.cc @@ -0,0 +1,182 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2020 by Contributors + * \file iter_sampler.cc + * \brief The sampler iterator for access dataset elements. + */ +#include +#include +#include +#include +#include +#include "../common/utils.h" +#include "./iter_batchloader.h" +#include "./iter_prefetcher.h" + +namespace mxnet { +namespace io { +struct SequentialSamplerParam : public dmlc::Parameter { + /*! \brief Length of the sequence. */ + size_t length; + /*! \brief Random seed.*/ + int start; + // declare parameters + DMLC_DECLARE_PARAMETER(SequentialSamplerParam) { + DMLC_DECLARE_FIELD(length) + .describe("Length of the sequence."); + DMLC_DECLARE_FIELD(start).set_default(0) + .describe("Start of the index."); + } +}; // struct SequentialSamplerParam + +DMLC_REGISTER_PARAMETER(SequentialSamplerParam); + +class SequentialSampler : public IIterator { + public: + virtual void Init(const std::vector >& kwargs) { + param_.InitAllowUnknown(kwargs); + indices_.resize(param_.length); + std::iota(std::begin(indices_), std::end(indices_), 0); // fill like arange + out_.data.resize(2); // label required by DataBatch, we can use fake label here + out_.data[1] = TBlob(indices_.data(), TShape({1, }), cpu::kDevMask, 0); + } + + virtual void BeforeFirst(void) { + pos_ = 0; + } + + virtual int64_t GetLenHint(void) const { + return static_cast(indices_.size()); + } + + virtual bool Next(void) { + if (pos_ < indices_.size()) { + int64_t *ptr = indices_.data() + pos_; + out_.data[0] = TBlob(ptr, TShape({1, }), cpu::kDevMask, 0); + ++pos_; + return true; + } + return false; + } + + virtual const DataInst &Value(void) const { + return out_; + } + + private: + /*! \brief Stored integer indices */ + std::vector indices_; + /*! \brief current position for iteration */ + std::size_t pos_; + /*! \brief data for next value */ + DataInst out_; + /*! \brief arguments */ + SequentialSamplerParam param_; +}; // class SequentialSampler + +MXNET_REGISTER_IO_ITER(SequentialSampler) +.describe(R"code(Returns the sequential sampler iterator. +)code" ADD_FILELINE) +.add_arguments(SequentialSamplerParam::__FIELDS__()) +.add_arguments(BatchSamplerParam::__FIELDS__()) +.set_body([]() { + return + new BatchSampler( + new SequentialSampler()); + }); + +struct RandomSamplerParam : public dmlc::Parameter { + /*! \brief Length of the sequence. */ + size_t length; + /*! 
\brief Random seed.*/ + int seed; + // declare parameters + DMLC_DECLARE_PARAMETER(RandomSamplerParam) { + DMLC_DECLARE_FIELD(length) + .describe("Length of the sequence."); + DMLC_DECLARE_FIELD(seed).set_default(0) + .describe("Random seed."); + } +}; // struct RandomSamplerParam + +DMLC_REGISTER_PARAMETER(RandomSamplerParam); + +class RandomSampler : public IIterator { + public: + virtual void Init(const std::vector >& kwargs) { + param_.InitAllowUnknown(kwargs); + indices_.resize(param_.length); + std::iota(std::begin(indices_), std::end(indices_), 0); // fill like arange + rng_.reset(new common::RANDOM_ENGINE(kRandMagic + param_.seed)); + out_.data.resize(2); // label required by DataBatch, we can use fake label here + out_.data[1] = TBlob(indices_.data(), TShape({1, }), cpu::kDevMask, 0); + BeforeFirst(); + } + + virtual void BeforeFirst(void) { + std::shuffle(std::begin(indices_), std::end(indices_), *rng_); + pos_ = 0; + } + + virtual int64_t GetLenHint(void) const { + return static_cast(indices_.size()); + } + + virtual bool Next(void) { + if (pos_ < indices_.size()) { + int64_t *ptr = indices_.data() + pos_; + out_.data[0] = TBlob(ptr, TShape({1, }), cpu::kDevMask, 0); + ++pos_; + return true; + } + return false; + } + + virtual const DataInst &Value(void) const { + return out_; + } + private: + /*! \brief random magic number */ + static const int kRandMagic = 2333; + /*! \brief Stored integer indices */ + std::vector indices_; + /*! \brief current position for iteration */ + std::size_t pos_; + /*! \brief data for next value */ + DataInst out_; + /*! \brief random generator engine */ + std::unique_ptr rng_; + /*! \brief arguments */ + RandomSamplerParam param_; +}; // class RandomSampler + +MXNET_REGISTER_IO_ITER(RandomSampler) +.describe(R"code(Returns the random sampler iterator. +)code" ADD_FILELINE) +.add_arguments(RandomSamplerParam::__FIELDS__()) +.add_arguments(BatchSamplerParam::__FIELDS__()) +.set_body([]() { + return new BatchSampler( + new RandomSampler()); + }); + +} // namespace io +} // namespace mxnet diff --git a/src/operator/image/crop-inl.h b/src/operator/image/crop-inl.h index c13049685dea..39f72c0be5ea 100644 --- a/src/operator/image/crop-inl.h +++ b/src/operator/image/crop-inl.h @@ -104,6 +104,8 @@ inline void CropImpl(int x, const OpContext &ctx, const std::vector &req) { using namespace mshadow; + CHECK_GT(width, 0) << "width <= 0"; + CHECK_GT(height, 0) << "height <= 0"; const TBlob& data = inputs[0]; const TBlob& out = outputs[0]; MXNET_NDIM_SWITCH(data.ndim(), ndim, { @@ -193,6 +195,310 @@ inline void CropOpBackward(const nnvm::NodeAttrs &attrs, const CropParam& param = nnvm::get(attrs.parsed); CropBackwardImpl(param.x, param.y, param.width, param.height, inputs, outputs, ctx, req); } + +struct RandomCropParam : public dmlc::Parameter { + Tuple xrange; + Tuple yrange; + int width; + int height; + int interp; + DMLC_DECLARE_PARAMETER(RandomCropParam) { + DMLC_DECLARE_FIELD(xrange).set_default(Tuple({0.f, 1.f})) + .describe("Left boundaries of the cropping area."); + DMLC_DECLARE_FIELD(yrange).set_default(Tuple({0.f, 1.f})) + .describe("Top boundaries of the cropping area."); + DMLC_DECLARE_FIELD(width) + .describe("The target image width"); + DMLC_DECLARE_FIELD(height) + .describe("The target image height."); + DMLC_DECLARE_FIELD(interp) + .set_default(1) + .describe("Interpolation method for resizing. 
By default uses bilinear interpolation" + "Options are INTER_NEAREST - a nearest-neighbor interpolation" + "INTER_LINEAR - a bilinear interpolation" + "INTER_AREA - resampling using pixel area relation" + "INTER_CUBIC - a bicubic interpolation over 4x4 pixel neighborhood" + "INTER_LANCZOS4 - a Lanczos interpolation over 8x8 pixel neighborhood" + "Note that the GPU version only support bilinear interpolation(1)"); + } +}; + +inline Tuple GetSourceSize(const TShape& in_shape) { + Tuple ret; + if (in_shape.ndim() == 3) { + ret = Tuple({static_cast(in_shape[W]), static_cast(in_shape[H])}); + } else if (in_shape.ndim() == 4) { + ret = Tuple({static_cast(in_shape[kW]), static_cast(in_shape[kH])}); + } else { + LOG(FATAL) << "Image RandomCrop expects inputs of 3D (h, w, c) or 4D (n, h, w, c). But got " + << in_shape.ndim(); + } + return ret; +} + +inline Tuple ScaleDown(const Tuple& src_shape, const Tuple& shape) { + float sw = src_shape[0]; + float sh = src_shape[1]; + float w = shape[0]; + float h = shape[1]; + if (sh < h) { + w = w * sh / h; + h = sh; + } + if (sw < w) { + w = sw; + h = h * sw / w; + } + return Tuple({static_cast(w), static_cast(h)}); +} + +inline bool RandomCropShape(const nnvm::NodeAttrs& attrs, + std::vector *in_attrs, + std::vector *out_attrs) { + // input attrs should only be (h, w, c) or (n, h, w, c) + if (in_attrs->at(0).ndim() == 3U) { + CHECK((in_attrs->at(0)[2] == 1) || (in_attrs->at(0)[2] == 3)) + << "Expect channel of the input image is 1 or 3, but got" + << in_attrs->at(0)[2]; + } else if (in_attrs->at(0).ndim() == 4U) { + CHECK((in_attrs->at(0)[3] == 1) || (in_attrs->at(0)[3] == 3)) + << "Expect channel of the input image is 1 or 3, but got" + << in_attrs->at(0)[3]; + } else { + LOG(FATAL) << "Image RandomCrop expects inputs of 3D (h, w, c) or 4D (n, h, w, c). But got " + << in_attrs->at(0).ndim(); + } + + const auto& ishape = (*in_attrs)[0]; + const RandomCropParam& param = nnvm::get(attrs.parsed); + + CHECK((param.height > 0) && (param.width > 0)) + << "Input height and width must be greater than 0"; + CHECK((param.xrange.ndim() == 2) && (param.yrange.ndim() == 2)) + << "Param xrange and yrange must have two values each"; + CHECK((param.xrange[0] <= param.xrange[1]) && (param.xrange[0] >= 0) && (param.xrange[1] <= 1)) + << "Invalid xrange, range should be within 0 and 1.0. Given: " << param.xrange; + CHECK((param.yrange[0] <= param.yrange[1]) && (param.yrange[0] >= 0) && (param.yrange[1] <= 1)) + << "Invalid yrange, range should be within 0 and 1.0. 
Given: " << param.yrange; + + // real output + if (ishape.ndim() == 3) { + SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape({param.height, param.width, ishape[C]})); + } else { + SHAPE_ASSIGN_CHECK(*out_attrs, 0, TShape({ishape[N], param.height, param.width, ishape[kC]})); + } + + // temp output + auto src_shape = GetSourceSize(ishape); + auto scaled_shape = ScaleDown(src_shape, Tuple({param.width, param.height})); + if (ishape.ndim() == 3) { + SHAPE_ASSIGN_CHECK(*out_attrs, 1, TShape( + {scaled_shape[1], scaled_shape[0], ishape[C]})); + } else { + SHAPE_ASSIGN_CHECK(*out_attrs, 1, TShape( + {ishape[N], scaled_shape[1], scaled_shape[0], ishape[kC]})); + } + return true; +} + +template +inline void RandomCropOpForward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_EQ(outputs.size(), 2U) << "out, temp"; + CHECK_EQ(inputs.size(), 1U); + const RandomCropParam& param = nnvm::get(attrs.parsed); + + const TShape& dshape = inputs[0].shape_; + Stream *s = ctx.get_stream(); + Random *prnd = ctx.requested[0].get_random(s); + auto src_size = GetSourceSize(dshape); + auto resize_size = GetSourceSize(outputs[1].shape_); + // random left/top position + float x = std::uniform_real_distribution( + param.xrange[0], param.xrange[1])(prnd->GetRndEngine()) * (src_size[0] - resize_size[0]); + float y = std::uniform_real_distribution( + param.yrange[0], param.yrange[1])(prnd->GetRndEngine()) * (src_size[1] - resize_size[1]); + // write x, y, w, h to temp workspace + Tensor workspace = ctx.requested[1].get_space( + mshadow::Shape1(4), s); + workspace.dptr_[0] = x; + workspace.dptr_[1] = y; + workspace.dptr_[2] = resize_size[0]; + workspace.dptr_[3] = resize_size[1]; + if (resize_size[0] == src_size[0] && resize_size[1] == src_size[1]) { + // no need to resize + CropImpl(x, y, resize_size[0], resize_size[1], inputs, outputs, ctx, req); + } else { + std::vector hidden_outputs = {outputs[1]}; + CropImpl(x, y, resize_size[0], resize_size[1], inputs, hidden_outputs, ctx, req); + ResizeParam rparam; + rparam.interp = param.interp; + rparam.keep_ratio = false; + rparam.size = Tuple({param.width, param.height}); + ResizeImplWrapper(rparam, ctx, hidden_outputs, outputs); + } +} + +template +inline void RandomCropOpBackward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(inputs.size(), 2U); + Tensor workspace = ctx.requested[1].get_space( + mshadow::Shape1(4), ctx.get_stream()); + auto ptr = workspace.dptr_; + CropBackwardImpl(ptr[0], ptr[1], ptr[2], ptr[3], inputs, outputs, ctx, req); +} + +struct RandomResizedCropParam : public dmlc::Parameter { + int width; + int height; + Tuple area; + Tuple ratio; + int interp; + int max_trial; + DMLC_DECLARE_PARAMETER(RandomResizedCropParam) { + DMLC_DECLARE_FIELD(width) + .describe("The target image width"); + DMLC_DECLARE_FIELD(height) + .describe("The target image height."); + DMLC_DECLARE_FIELD(area).set_default(Tuple({0.08f, 1.f})) + .describe("Range of cropping area percentage."); + DMLC_DECLARE_FIELD(ratio).set_default(Tuple({3 / 4.f, 4 / 3.f})) + .describe("Range of aspect ratio of the randomly cropped area."); + DMLC_DECLARE_FIELD(interp) + .set_default(1) + .describe("Interpolation method for resizing. 
By default uses bilinear interpolation" + "Options are INTER_NEAREST - a nearest-neighbor interpolation" + "INTER_LINEAR - a bilinear interpolation" + "INTER_AREA - resampling using pixel area relation" + "INTER_CUBIC - a bicubic interpolation over 4x4 pixel neighborhood" + "INTER_LANCZOS4 - a Lanczos interpolation over 8x8 pixel neighborhood" + "Note that the GPU version only support bilinear interpolation(1)"); + DMLC_DECLARE_FIELD(max_trial).set_default(10) + .describe("Max trial before fallback to center crop."); + } +}; + +inline bool RandomResizedCropShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + const RandomResizedCropParam& param = nnvm::get(attrs.parsed); + ResizeParam resize_param; + resize_param.size = mxnet::Tuple({param.width, param.height}); + resize_param.keep_ratio = false; + resize_param.interp = param.interp; + return ResizeShapeImpl(resize_param, in_attrs, out_attrs); +} + +template +inline void CropResizeImpl(const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs, + int x0, int y0, int crop_width, int crop_height, + int resize_width, int resize_height, int interp) { + auto& dshape = inputs[0].shape_; + Stream *s = ctx.get_stream(); + CHECK(x0 >= 0 && y0 >= 0 && crop_width > 0 && + crop_height > 0 && resize_width > 0 && resize_height > 0) + << "Invalid crop resize arguments: " << x0 << ", " << y0 << ", " + << crop_width << ", " << crop_height + << ", " << resize_width << ", " << resize_height; + MSHADOW_TYPE_SWITCH_WITH_BOOL(inputs[0].type_flag_, DType, { + if (dshape.ndim() == 3) { + Tensor workspace = ctx.requested[1].get_space_typed( + mshadow::Shape3(crop_height, crop_width, dshape[C]), s); + std::vector temp_out = {TBlob(workspace)}; + CropImpl(x0, y0, crop_width, crop_height, inputs, temp_out, ctx, req); + ResizeParam rparam; + rparam.interp = interp; + rparam.keep_ratio = false; + rparam.size = Tuple({resize_width, resize_height}); + ResizeImplWrapper(rparam, ctx, temp_out, outputs); + } else if (dshape.ndim() == 4) { + Tensor workspace = ctx.requested[1].get_space_typed( + mshadow::Shape4(dshape[N], crop_height, crop_width, dshape[kC]), s); + std::vector temp_out = {TBlob(workspace)}; + CropImpl(x0, y0, crop_width, crop_height, inputs, temp_out, ctx, req); + ResizeParam rparam; + rparam.interp = interp; + rparam.keep_ratio = false; + rparam.size = Tuple({resize_width, resize_height}); + ResizeImplWrapper(rparam, ctx, temp_out, outputs); + } else { + LOG(FATAL) << "Crop only supports image with 3 or 4 dims, given " << dshape.ndim(); + } + }); +} + +template +inline void RandomResizedCropOpForward(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_EQ(outputs.size(), 1U); + CHECK_EQ(inputs.size(), 1U); + const RandomResizedCropParam& param = nnvm::get(attrs.parsed); + auto src_size = GetSourceSize(inputs[0].shape_); + int64_t src_area = src_size[0] * src_size[1]; + Stream *s = ctx.get_stream(); + Random *prnd = ctx.requested[0].get_random(s); + for (int i = 0; i < param.max_trial; ++i) { + float target_area = std::uniform_real_distribution( + param.area[0], param.area[1])(prnd->GetRndEngine()) * src_area; + float log_ratio_low = std::log(param.ratio[0]); + float log_ratio_high = std::log(param.ratio[1]); + float new_ratio = std::exp(std::uniform_real_distribution( + log_ratio_low, log_ratio_high)(prnd->GetRndEngine())); + int new_w = 
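/* pick new_w = sqrt(target_area * new_ratio) and new_h = sqrt(target_area / new_ratio), so that new_w * new_h is approximately target_area and new_w / new_h is approximately new_ratio */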
static_cast<int>(std::round(std::sqrt(target_area * new_ratio)));
+    int new_h = static_cast<int>(std::round(std::sqrt(target_area / new_ratio)));
+    if (new_w <= src_size[0] && new_h <= src_size[1]) {
+      int x0 = std::uniform_real_distribution<float>(0, 1)(
+          prnd->GetRndEngine()) * (src_size[0] - new_w);
+      int y0 = std::uniform_real_distribution<float>(0, 1)(
+          prnd->GetRndEngine()) * (src_size[1] - new_h);
+      if (new_w == param.width && new_h == param.height) {
+        // no need to resize
+        CropImpl<xpu>(x0, y0, new_w, new_h, inputs, outputs, ctx, req);
+      } else {
+        CropResizeImpl<xpu>(ctx, inputs, req, outputs, x0, y0,
+                            new_w, new_h, param.width, param.height, param.interp);
+      }
+      return;
+    }
+  }
+  // fallback to center crop (offsets are relative to the source image)
+  auto scaled_shape = ScaleDown(src_size, Tuple<int>({param.width, param.height}));
+  int x0 = (src_size[0] - scaled_shape[0]) / 2;
+  int y0 = (src_size[1] - scaled_shape[1]) / 2;
+  CHECK(x0 >= 0 && y0 >= 0) << "Invalid center crop: " << x0 << ", " << y0;
+  if (scaled_shape[0] == param.width && scaled_shape[1] == param.height) {
+    // no need to resize
+    CropImpl<xpu>(x0, y0, scaled_shape[0], scaled_shape[1], inputs, outputs, ctx, req);
+  } else {
+    CropResizeImpl<xpu>(ctx, inputs, req, outputs, x0, y0,
+                        scaled_shape[0], scaled_shape[1],
+                        param.width, param.height, param.interp);
+  }
+}
+
+template<typename xpu>
+inline void RandomResizedCropOpBackward(const nnvm::NodeAttrs &attrs,
+                                        const OpContext &ctx,
+                                        const std::vector<TBlob> &inputs,
+                                        const std::vector<OpReqType> &req,
+                                        const std::vector<TBlob> &outputs) {
+  CHECK_EQ(outputs.size(), 1U);
+  LOG(FATAL) << "Backward for RandomResizedCropOp not implemented";
+}
 }  // namespace image
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/image/crop.cc b/src/operator/image/crop.cc
index 9a7aad38b486..ba31bf4c4c0e 100644
--- a/src/operator/image/crop.cc
+++ b/src/operator/image/crop.cc
@@ -33,10 +33,12 @@ namespace op {
 namespace image {
 
 DMLC_REGISTER_PARAMETER(CropParam);
+DMLC_REGISTER_PARAMETER(RandomCropParam);
+DMLC_REGISTER_PARAMETER(RandomResizedCropParam);
 
 NNVM_REGISTER_OP(_image_crop)
 .add_alias("_npx__image_crop")
-.describe(R"code(Crop an image NDArray of shape (H x W x C) or (N x H x W x C)
+.describe(R"code(Crop an image NDArray of shape (H x W x C) or (N x H x W x C)
 to the given size.
 Example:
     .. code-block:: python
@@ -49,7 +51,7 @@ to the given size.
           [177  25  15]]]
     image = mx.nd.random.uniform(0, 255, (2, 4, 2, 3)).astype(dtype=np.uint8)
-    mx.nd.image.crop(image, 1, 1, 2, 2)
+    mx.nd.image.crop(image, 1, 1, 2, 2)
         [[[[ 35 198  50]
            [242  94 168]]
@@ -81,6 +83,74 @@ NNVM_REGISTER_OP(_backward_image_crop)
 .set_attr<nnvm::TIsBackward>("TIsBackward", true)
 .set_attr<FCompute>("FCompute", CropOpBackward<cpu>);
 
+NNVM_REGISTER_OP(_image_random_crop)
+.add_alias("_npx__image_random_crop")
+.describe(R"code(Randomly crop an image NDArray of shape (H x W x C) or (N x H x W x C)
+to the given size. Upsample result if `src` is smaller than `size`.
+Example:
+    .. code-block:: python
+        im = mx.nd.array(cv2.imread("flower.jpg"))
+        cropped_im, rect = mx.nd.image.random_crop(im, (100, 100))
+        print(cropped_im)
+
+)code" ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(2)
+.set_attr_parser(ParamParser<RandomCropParam>)
+.set_attr<nnvm::FNumVisibleOutputs>(
+    "FNumVisibleOutputs", [](const NodeAttrs& attrs) { return static_cast<uint32_t>(1); })
+.set_attr<mxnet::FInferShape>("FInferShape", RandomCropShape)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 2>)
+.set_attr<FCompute>("FCompute", RandomCropOpForward<cpu>)
+.set_attr<nnvm::FGradient>("FGradient", ElemwiseGradUseNone{ "_backward_random_image_crop" })
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{
+      ResourceRequest::kRandom, ResourceRequest::kTempSpace};
+  })
+.add_argument("data", "NDArray-or-Symbol", "The input.")
+.add_arguments(RandomCropParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_random_image_crop)
+.set_attr_parser(ParamParser<RandomCropParam>)
+.set_num_inputs(2)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FCompute>("FCompute", RandomCropOpBackward<cpu>);
+
+NNVM_REGISTER_OP(_image_random_resized_crop)
+.add_alias("_npx__image_random_resized_crop")
+.describe(R"code(Randomly crop an image NDArray of shape (H x W x C) or (N x H x W x C)
+to the given size. Randomize area and aspect ratio. Upsample result if `src` is smaller than `size`.
+Example:
+    .. code-block:: python
+        im = mx.nd.array(cv2.imread("flower.jpg"))
+        cropped_im, rect = mx.nd.image.random_resized_crop(im, (100, 100))
+        print(cropped_im)
+
+)code" ADD_FILELINE)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr_parser(ParamParser<RandomResizedCropParam>)
+.set_attr<mxnet::FInferShape>("FInferShape", RandomResizedCropShape)
+.set_attr<nnvm::FInferType>("FInferType", ElemwiseType<1, 1>)
+.set_attr<FCompute>("FCompute", RandomResizedCropOpForward<cpu>)
+.set_attr<nnvm::FGradient>("FGradient",
+    ElemwiseGradUseNone{ "_backward_random_resized_image_crop" })
+.set_attr<FResourceRequest>("FResourceRequest",
+  [](const NodeAttrs& attrs) {
+    return std::vector<ResourceRequest>{
+      ResourceRequest::kRandom, ResourceRequest::kTempSpace};
+  })
+.add_argument("data", "NDArray-or-Symbol", "The input.")
+.add_arguments(RandomResizedCropParam::__FIELDS__());
+
+NNVM_REGISTER_OP(_backward_random_resized_image_crop)
+.set_attr_parser(ParamParser<RandomResizedCropParam>)
+.set_num_inputs(1)
+.set_num_outputs(1)
+.set_attr<nnvm::TIsBackward>("TIsBackward", true)
+.set_attr<FCompute>("FCompute", RandomResizedCropOpBackward<cpu>);
 }  // namespace image
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/image/crop.cu b/src/operator/image/crop.cu
index 71fde06dacc0..a4a05f55d5d6 100644
--- a/src/operator/image/crop.cu
+++ b/src/operator/image/crop.cu
@@ -29,6 +29,17 @@ NNVM_REGISTER_OP(_image_crop)
 NNVM_REGISTER_OP(_backward_image_crop)
 .set_attr<FCompute>("FCompute", CropOpBackward<gpu>);
 
+NNVM_REGISTER_OP(_image_random_crop)
+.set_attr<FCompute>("FCompute", RandomCropOpForward<gpu>);
+
+NNVM_REGISTER_OP(_backward_random_image_crop)
+.set_attr<FCompute>("FCompute", RandomCropOpBackward<gpu>);
+
+NNVM_REGISTER_OP(_image_random_resized_crop)
+.set_attr<FCompute>("FCompute", RandomResizedCropOpForward<gpu>);
+
+NNVM_REGISTER_OP(_backward_random_resized_image_crop)
+.set_attr<FCompute>("FCompute", RandomResizedCropOpBackward<gpu>);
 }  // namespace image
 }  // namespace op
 }  // namespace mxnet
diff --git a/src/operator/image/image_random-inl.h b/src/operator/image/image_random-inl.h
index e00b255bfd30..f01b135aa209 100644
--- a/src/operator/image/image_random-inl.h
+++ b/src/operator/image/image_random-inl.h
@@ -588,6 +588,16 @@ inline void FlipTopBottom(const nnvm::NodeAttrs &attrs,
   });
 }
 
+struct RandomFlipParam : public dmlc::Parameter<RandomFlipParam> {
+  float p;
+
+  DMLC_DECLARE_PARAMETER(RandomFlipParam) {
+    DMLC_DECLARE_FIELD(p)
+    .set_default(0.5f)
+    .describe("The probability of flipping the image.");
+  }
+};
+
 inline void RandomFlipLeftRight(
   const nnvm::NodeAttrs &attrs,
   const OpContext &ctx,
@@ -595,10 +605,12 @@ inline void RandomFlipLeftRight(
   const std::vector<OpReqType> &req,
   const std::vector<TBlob> &outputs) {
   using namespace mshadow;
+  const RandomFlipParam &param = nnvm::get<RandomFlipParam>(attrs.parsed);
   Stream<cpu> *s = ctx.get_stream<cpu>();
   Random<cpu> *prnd = ctx.requested[0].get_random<cpu>(s);
+  std::uniform_real_distribution<float> dist(0, 1);
   MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
-    if (std::bernoulli_distribution()(prnd->GetRndEngine())) {
+    if (dist(prnd->GetRndEngine()) > param.p) {
       if (outputs[0].dptr_ != inputs[0].dptr_) {
         std::memcpy(outputs[0].dptr_, inputs[0].dptr_, inputs[0].Size() * sizeof(DType));
       }
@@ -616,10 +628,12 @@ inline void RandomFlipTopBottom(
   const std::vector<OpReqType> &req,
   const std::vector<TBlob> &outputs) {
   using namespace mshadow;
+  const RandomFlipParam &param = nnvm::get<RandomFlipParam>(attrs.parsed);
   Stream<cpu> *s = ctx.get_stream<cpu>();
   Random<cpu> *prnd = ctx.requested[0].get_random<cpu>(s);
+  std::uniform_real_distribution<float> dist(0, 1);
   MSHADOW_TYPE_SWITCH(outputs[0].type_flag_, DType, {
-    if (std::bernoulli_distribution()(prnd->GetRndEngine())) {
+    if (dist(prnd->GetRndEngine()) > param.p) {
       if (outputs[0].dptr_ != inputs[0].dptr_) {
         std::memcpy(outputs[0].dptr_, inputs[0].dptr_, inputs[0].Size() * sizeof(DType));
       }
diff --git a/src/operator/image/image_random.cc b/src/operator/image/image_random.cc
index aa387e683bfd..c7b71317b7c3 100644
--- a/src/operator/image/image_random.cc
+++ b/src/operator/image/image_random.cc
@@ -33,6 +33,7 @@ namespace op {
 namespace image {
 
 DMLC_REGISTER_PARAMETER(NormalizeParam);
+DMLC_REGISTER_PARAMETER(RandomFlipParam);
 DMLC_REGISTER_PARAMETER(RandomEnhanceParam);
 DMLC_REGISTER_PARAMETER(AdjustLightingParam);
 DMLC_REGISTER_PARAMETER(RandomLightingParam);
@@ -197,7 +198,9 @@ MXNET_REGISTER_IMAGE_AUG_OP(_image_flip_left_right)
 
 MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_flip_left_right)
 .add_alias("_npx__image_random_flip_left_right")
+.set_attr_parser(ParamParser<RandomFlipParam>)
 .describe(R"code()code" ADD_FILELINE)
+.add_arguments(RandomFlipParam::__FIELDS__())
 .set_attr<FCompute>("FCompute", RandomFlipLeftRight);
 
 MXNET_REGISTER_IMAGE_AUG_OP(_image_flip_top_bottom)
@@ -207,7 +210,9 @@ MXNET_REGISTER_IMAGE_AUG_OP(_image_flip_top_bottom)
 
 MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_flip_top_bottom)
 .add_alias("_npx__image_random_flip_top_bottom")
+.set_attr_parser(ParamParser<RandomFlipParam>)
 .describe(R"code()code" ADD_FILELINE)
+.add_arguments(RandomFlipParam::__FIELDS__())
 .set_attr<FCompute>("FCompute", RandomFlipTopBottom);
 
 MXNET_REGISTER_IMAGE_RND_AUG_OP(_image_random_brightness)
diff --git a/src/operator/image/resize-inl.h b/src/operator/image/resize-inl.h
index aa381811195d..b6f64d40ca82 100644
--- a/src/operator/image/resize-inl.h
+++ b/src/operator/image/resize-inl.h
@@ -109,15 +109,14 @@ inline SizeParam GetHeightAndWidth(int data_h,
   return SizeParam(resized_h, resized_w);
 }
 
-inline bool ResizeShape(const nnvm::NodeAttrs& attrs,
-                        mxnet::ShapeVector *in_attrs,
-                        mxnet::ShapeVector *out_attrs) {
+inline bool ResizeShapeImpl(const ResizeParam& param,
+                            mxnet::ShapeVector *in_attrs,
+                            mxnet::ShapeVector *out_attrs) {
   // input attrs should only be (h, w, c) or (n, h, w, c)
   CHECK((in_attrs->at(0).ndim() == 3U) || (in_attrs->at(0).ndim() == 4U))
     << "Input image dimension should be 3 or 4 but got "
     << in_attrs->at(0).ndim();
   const auto& ishape = (*in_attrs)[0];
-  const ResizeParam& param = nnvm::get<ResizeParam>(attrs.parsed);
   SizeParam size;
   if (ishape.ndim() == 3) {
     size = GetHeightAndWidth(ishape[H], ishape[W], param);
@@ -130,6 +129,13 @@ inline bool 
ResizeShape(const nnvm::NodeAttrs& attrs, return true; } +inline bool ResizeShape(const nnvm::NodeAttrs& attrs, + mxnet::ShapeVector *in_attrs, + mxnet::ShapeVector *out_attrs) { + const ResizeParam& param = nnvm::get(attrs.parsed); + return ResizeShapeImpl(param, in_attrs, out_attrs); +} + inline void ResizeImpl(const std::vector &inputs, const std::vector &outputs, const int height, @@ -168,13 +174,10 @@ inline void ResizeImpl(const std::vector &inputs, } template -inline void Resize(const nnvm::NodeAttrs &attrs, - const OpContext &ctx, - const std::vector &inputs, - const std::vector &req, - const std::vector &outputs) { - CHECK_EQ(outputs.size(), 1U); - const ResizeParam& param = nnvm::get(attrs.parsed); +inline void ResizeImplWrapper(const ResizeParam& param, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &outputs) { SizeParam size; if (std::is_same::value) { #if MXNET_USE_CUDA @@ -208,6 +211,17 @@ inline void Resize(const nnvm::NodeAttrs &attrs, } } +template +inline void Resize(const nnvm::NodeAttrs &attrs, + const OpContext &ctx, + const std::vector &inputs, + const std::vector &req, + const std::vector &outputs) { + CHECK_EQ(outputs.size(), 1U); + const ResizeParam& param = nnvm::get(attrs.parsed); + ResizeImplWrapper(param, ctx, inputs, outputs); +} + } // namespace image } // namespace op } // namespace mxnet diff --git a/tests/python/unittest/test_contrib_gluon_data_vision.py b/tests/python/unittest/test_contrib_gluon_data_vision.py new file mode 100644 index 000000000000..9a95d202bab7 --- /dev/null +++ b/tests/python/unittest/test_contrib_gluon_data_vision.py @@ -0,0 +1,151 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
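+# These tests exercise the C++-backed ImageDataLoader and ImageBboxDataLoader
+# in mxnet.gluon.contrib.data.vision added by this patch.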
+ +import mxnet as mx +import numpy as np +import scipy.ndimage +from mxnet.test_utils import * +from common import assertRaises, with_seed +import shutil +import tempfile +import unittest + +from nose.tools import raises + +def _get_data(url, dirname): + import os, tarfile + download(url, dirname=dirname, overwrite=False) + fname = os.path.join(dirname, url.split('/')[-1]) + tar = tarfile.open(fname) + source_images = [os.path.join(dirname, x.name) for x in tar.getmembers() if x.isfile()] + if len(source_images) < 1 or not os.path.isfile(source_images[0]): + # skip extracting if exists + tar.extractall(path=dirname) + tar.close() + return source_images + +def _generate_objects(): + num = np.random.randint(1, 10) + xy = np.random.rand(num, 2) + wh = np.random.rand(num, 2) / 2 + left = (xy[:, 0] - wh[:, 0])[:, np.newaxis] + right = (xy[:, 0] + wh[:, 0])[:, np.newaxis] + top = (xy[:, 1] - wh[:, 1])[:, np.newaxis] + bot = (xy[:, 1] + wh[:, 1])[:, np.newaxis] + boxes = np.maximum(0., np.minimum(1., np.hstack((left, top, right, bot)))) + cid = np.random.randint(0, 20, size=num) + label = np.hstack((cid[:, np.newaxis], boxes)).ravel().tolist() + return [2, 5] + label + + +class TestImage(unittest.TestCase): + IMAGES_URL = "http://data.mxnet.io/data/test_images.tar.gz" + IMAGES = [] + IMAGES_DIR = None + + @classmethod + def setupClass(cls): + cls.IMAGES_DIR = tempfile.mkdtemp() + cls.IMAGES = _get_data(cls.IMAGES_URL, cls.IMAGES_DIR) + print("Loaded {} images".format(len(cls.IMAGES))) + + @classmethod + def teardownClass(cls): + if cls.IMAGES_DIR: + print("cleanup {}".format(cls.IMAGES_DIR)) + shutil.rmtree(cls.IMAGES_DIR) + + def test_imageiter(self): + im_list = [[np.random.randint(0, 5), x] for x in TestImage.IMAGES] + os.makedirs('./data', exist_ok=True) + fname = './data/test_imageiter.lst' + file_list = ['\t'.join([str(k), str(np.random.randint(0, 5)), x]) + for k, x in enumerate(TestImage.IMAGES)] + with open(fname, 'w') as f: + for line in file_list: + f.write(line + '\n') + + test_list = ['imglist', 'path_imglist'] + for dtype in ['int32', 'float32', 'int64', 'float64']: + for test in test_list: + imglist = im_list if test == 'imglist' else None + path_imglist = fname if test == 'path_imglist' else None + imageiter_list = [ + mx.gluon.contrib.data.vision.ImageDataLoader(2, (3, 224, 224), imglist=imglist, + path_imglist=path_imglist, path_root='', dtype=dtype), + mx.gluon.contrib.data.vision.ImageDataLoader(3, (3, 224, 224), imglist=imglist, + path_imglist=path_imglist, path_root='', dtype=dtype, last_batch='discard'), + mx.gluon.contrib.data.vision.ImageDataLoader(3, (3, 224, 224), imglist=imglist, + path_imglist=path_imglist, path_root='', dtype=dtype, last_batch='keep'), + mx.gluon.contrib.data.vision.ImageDataLoader(3, (3, 224, 224), imglist=imglist, + path_imglist=path_imglist, path_root='', dtype=dtype, last_batch='rollover'), + mx.gluon.contrib.data.vision.ImageDataLoader(3, (3, 224, 224), imglist=imglist, shuffle=True, + path_imglist=path_imglist, path_root='', dtype=dtype, last_batch='keep', + rand_crop=1, rand_gray=0.1, rand_mirror=True) + ] + for it in imageiter_list: + for batch in it: + pass + + def test_image_bbox_iter(self): + im_list = [_generate_objects() + [x] for x in TestImage.IMAGES] + det_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 300, 300), imglist=im_list, path_root='') + for _ in range(3): + for _ in det_iter: + pass + val_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 300, 300), imglist=im_list, path_root='') + + # test 
batch_size is not divisible by number of images + det_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(4, (3, 300, 300), imglist=im_list, path_root='') + for _ in det_iter: + pass + + # test file list with last batch handle + os.makedirs('./data', exist_ok=True) + fname = './data/test_imagedetiter.lst' + im_list = [[k] + _generate_objects() + [x] for k, x in enumerate(TestImage.IMAGES)] + with open(fname, 'w') as f: + for line in im_list: + line = '\t'.join([str(k) for k in line]) + f.write(line + '\n') + + imageiter_list = [ + mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 400, 400), + path_imglist=fname, path_root=''), + mx.gluon.contrib.data.vision.ImageBboxDataLoader(3, (3, 400, 400), + path_imglist=fname, path_root='', last_batch='discard'), + mx.gluon.contrib.data.vision.ImageBboxDataLoader(3, (3, 400, 400), + path_imglist=fname, path_root='', last_batch='keep'), + mx.gluon.contrib.data.vision.ImageBboxDataLoader(3, (3, 400, 400), + path_imglist=fname, path_root='', last_batch='rollover'), + mx.gluon.contrib.data.vision.ImageBboxDataLoader(3, (3, 400, 400), shuffle=True, + path_imglist=fname, path_root='', last_batch='keep') + ] + + + def test_bbox_augmenters(self): + # only test if all augmenters will work + # TODO(Joshua Zhang): verify the augmenter outputs + im_list = [_generate_objects() + [x] for x in TestImage.IMAGES] + det_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 300, 300), imglist=im_list, path_root='', + rand_crop=1, rand_pad=1, rand_gray=0.1, rand_mirror=True, mean=True, + std=[1.1, 1.03, 1.05], brightness=0.1, contrast=0.1, saturation=0.1, + pca_noise=0.1, hue=0.1, inter_method=10, + max_aspect_ratio=5, area_range=(0.1, 4.0), + max_attempts=50) + for batch in det_iter: + pass diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py index e6e3caebdf16..ab9d448f6044 100644 --- a/tests/python/unittest/test_gluon_data.py +++ b/tests/python/unittest/test_gluon_data.py @@ -17,6 +17,7 @@ import os import tarfile +import tempfile import unittest import mxnet as mx import numpy as np @@ -53,7 +54,7 @@ def prepare_record(): if not os.path.isdir("data/test_images/test_images"): gluon.utils.download("http://data.mxnet.io/data/test_images.tar.gz", "data/test_images.tar.gz") tarfile.open('data/test_images.tar.gz').extractall('data/test_images/') - if not os.path.exists('data/test.rec'): + if not os.path.exists('data/test.rec') or not os.path.exists('data/test.idx'): imgs = os.listdir('data/test_images/test_images') record = mx.recordio.MXIndexedRecordIO('data/test.idx', 'data/test.rec', 'w') for i, img in enumerate(imgs): @@ -74,6 +75,20 @@ def test_recordimage_dataset(): assert x.shape[0] == 1 and x.shape[3] == 3 assert y.asscalar() == i +@with_seed() +def test_recordimage_dataset_handle(): + recfile = prepare_record() + class TmpTransform(mx.gluon.HybridBlock): + def hybrid_forward(self, F, x): + return x + fn = TmpTransform() + dataset = gluon.data.vision.ImageRecordDataset(recfile).transform_first(fn).__mx_handle__() + loader = gluon.data.DataLoader(dataset, 1) + + for i, (x, y) in enumerate(loader): + assert x.shape[0] == 1 and x.shape[3] == 3 + assert y.asscalar() == i + def _dataset_transform_fn(x, y): """Named transform function since lambda function cannot be pickled.""" return x, y @@ -86,7 +101,7 @@ def _dataset_transform_first_fn(x): def test_recordimage_dataset_with_data_loader_multiworker(): recfile = prepare_record() dataset = gluon.data.vision.ImageRecordDataset(recfile) - loader = 
gluon.data.DataLoader(dataset, 1, num_workers=5) + loader = gluon.data.DataLoader(dataset, 1, num_workers=5, try_nopython=True) for i, (x, y) in enumerate(loader): assert x.shape[0] == 1 and x.shape[3] == 3 @@ -94,7 +109,7 @@ def test_recordimage_dataset_with_data_loader_multiworker(): # with transform dataset = gluon.data.vision.ImageRecordDataset(recfile).transform(_dataset_transform_fn) - loader = gluon.data.DataLoader(dataset, 1, num_workers=5) + loader = gluon.data.DataLoader(dataset, 1, num_workers=5, try_nopython=False) for i, (x, y) in enumerate(loader): assert x.shape[0] == 1 and x.shape[3] == 3 @@ -102,7 +117,7 @@ def test_recordimage_dataset_with_data_loader_multiworker(): # with transform_first dataset = gluon.data.vision.ImageRecordDataset(recfile).transform_first(_dataset_transform_first_fn) - loader = gluon.data.DataLoader(dataset, 1, num_workers=5) + loader = gluon.data.DataLoader(dataset, 1, num_workers=5, try_nopython=False) for i, (x, y) in enumerate(loader): assert x.shape[0] == 1 and x.shape[3] == 3 @@ -133,6 +148,18 @@ def test_datasets(): assert len(gluon.data.vision.CIFAR100(root='data/cifar100', fine_label=True)) == 50000 assert len(gluon.data.vision.CIFAR100(root='data/cifar100', train=False)) == 10000 +@with_seed() +def test_datasets_handles(): + assert len(gluon.data.vision.MNIST(root='data/mnist').__mx_handle__()) == 60000 + assert len(gluon.data.vision.MNIST(root='data/mnist', train=False).__mx_handle__()) == 10000 + assert len(gluon.data.vision.FashionMNIST(root='data/fashion-mnist').__mx_handle__()) == 60000 + assert len(gluon.data.vision.FashionMNIST(root='data/fashion-mnist', train=False).__mx_handle__()) == 10000 + assert len(gluon.data.vision.CIFAR10(root='data/cifar10').__mx_handle__()) == 50000 + assert len(gluon.data.vision.CIFAR10(root='data/cifar10', train=False).__mx_handle__()) == 10000 + assert len(gluon.data.vision.CIFAR100(root='data/cifar100').__mx_handle__()) == 50000 + assert len(gluon.data.vision.CIFAR100(root='data/cifar100', fine_label=True).__mx_handle__()) == 50000 + assert len(gluon.data.vision.CIFAR100(root='data/cifar100', train=False).__mx_handle__()) == 10000 + @with_seed() def test_image_folder_dataset(): prepare_record() @@ -140,6 +167,63 @@ def test_image_folder_dataset(): assert dataset.synsets == ['test_images'] assert len(dataset.items) == 16 +@with_seed() +def test_image_folder_dataset_handle(): + prepare_record() + dataset = gluon.data.vision.ImageFolderDataset('data/test_images') + hd = dataset.__mx_handle__() + assert len(hd) == 16 + assert (hd[1][0] == dataset[1][0]).asnumpy().all() + assert hd[5][1] == dataset[5][1] + +@with_seed() +def test_image_list_dataset(): + prepare_record() + imlist = os.listdir('data/test_images/test_images') + imglist = [(0, path) for i, path in enumerate(imlist)] + dataset = gluon.data.vision.ImageListDataset(root='data/test_images/test_images', imglist=imglist) + assert len(dataset) == 16, len(dataset) + img, label = dataset[0] + assert len(img.shape) == 3 + assert label == 0 + + # save to file as *.lst + imglist = ['\t'.join((str(i), '0', path)) for i, path in enumerate(imlist)] + with tempfile.NamedTemporaryFile('wt', delete=False) as fp: + for line in imglist: + fp.write(line + '\n') + fp.close() + + dataset = gluon.data.vision.ImageListDataset(root='data/test_images/test_images', imglist=fp.name) + assert len(dataset) == 16, len(dataset) + img, label = dataset[0] + assert len(img.shape) == 3 + assert label == 0 + +@with_seed() +def test_image_list_dataset_handle(): + prepare_record() + 
imlist = os.listdir('data/test_images/test_images') + imglist = [(0, path) for i, path in enumerate(imlist)] + dataset = gluon.data.vision.ImageListDataset(root='data/test_images/test_images', imglist=imglist).__mx_handle__() + assert len(dataset) == 16, len(dataset) + img, label = dataset[0] + assert len(img.shape) == 3 + assert label == 0 + + # save to file as *.lst + imglist = ['\t'.join((str(i), '0', path)) for i, path in enumerate(imlist)] + with tempfile.NamedTemporaryFile('wt', delete=False) as fp: + for line in imglist: + fp.write(line + '\n') + fp.close() + + dataset = gluon.data.vision.ImageListDataset(root='data/test_images/test_images', imglist=fp.name).__mx_handle__() + assert len(dataset) == 16 + img, label = dataset[0] + assert len(img.shape) == 3 + assert label == 0 + @with_seed() def test_list_dataset(): for num_worker in range(0, 3): @@ -148,7 +232,7 @@ def test_list_dataset(): pass -class Dataset(gluon.data.Dataset): +class _Dataset(gluon.data.Dataset): def __len__(self): return 100 def __getitem__(self, key): @@ -156,7 +240,7 @@ def __getitem__(self, key): @with_seed() def test_multi_worker(): - data = Dataset() + data = _Dataset() for thread_pool in [True, False]: loader = gluon.data.DataLoader(data, batch_size=1, num_workers=5, thread_pool=thread_pool) for i, batch in enumerate(loader): @@ -317,6 +401,19 @@ def test_dataset_filter(): for idx, sample in enumerate(a_xform_filtered): assert sample % 10 == 0 +def test_dataset_filter_handle(): + length = 100 + a = mx.gluon.data.SimpleDataset(np.arange(length)) + a_filtered = a.filter(lambda x: x % 10 == 0).__mx_handle__() + assert(len(a_filtered) == 10) + for idx, sample in enumerate(a_filtered): + assert sample % 10 == 0 + a_xform_filtered = a.transform(lambda x: x + 1).filter(lambda x: x % 10 == 0) + assert(len(a_xform_filtered) == 10) + # the filtered data is already transformed + for idx, sample in enumerate(a_xform_filtered): + assert sample % 10 == 0 + def test_dataset_shard(): length = 9 a = mx.gluon.data.SimpleDataset([i for i in range(length)]) @@ -335,6 +432,24 @@ def test_dataset_shard(): total += sample assert total == sum(a) +def test_dataset_shard_handle(): + length = 9 + a = mx.gluon.data.SimpleDataset(np.arange(length)) + shard_0 = a.shard(4, 0).__mx_handle__() + shard_1 = a.shard(4, 1).__mx_handle__() + shard_2 = a.shard(4, 2).__mx_handle__() + shard_3 = a.shard(4, 3).__mx_handle__() + assert len(shard_0) + len(shard_1) + len(shard_2) + len(shard_3) == length + assert len(shard_0) == 3 + assert len(shard_1) == 2 + assert len(shard_2) == 2 + assert len(shard_3) == 2 + total = 0 + for shard in [shard_0, shard_1, shard_2, shard_3]: + for idx, sample in enumerate(shard): + total += sample + assert total == sum(a) + def test_dataset_take(): length = 100 a = mx.gluon.data.SimpleDataset([i for i in range(length)]) @@ -361,6 +476,32 @@ def test_dataset_take(): total += sample assert total == expected_total +def test_dataset_take_handle(): + length = 100 + a = mx.gluon.data.SimpleDataset(np.arange(length)) + a_take_full = a.take(1000).__mx_handle__() + assert len(a_take_full) == length + a_take_full = a.take(None).__mx_handle__() + assert len(a_take_full) == length + count = 10 + a_take_10 = a.take(count).__mx_handle__() + assert len(a_take_10) == count + expected_total = sum([i for i in range(count)]) + total = 0 + for idx, sample in enumerate(a_take_10): + assert sample < count + total += sample + assert total == expected_total + + a_xform_take_10 = a.take(count).__mx_handle__() + assert len(a_xform_take_10) == 
count + expected_total = sum([i for i in range(count)]) + total = 0 + for idx, sample in enumerate(a_xform_take_10): + assert sample < count + total += sample + assert total == expected_total + def test_dataloader_scope(): """ Bug: Gluon DataLoader terminates the process pool early while @@ -382,3 +523,86 @@ def test_dataloader_scope(): assert item is not None +def test_mx_datasets_handle(): + # _DownloadedDataset + mnist = mx.gluon.data.vision.MNIST(train=False).__mx_handle__() + assert len(mnist) == 10000 + cifar10 = mx.gluon.data.vision.CIFAR10(train=False).__mx_handle__() + assert len(cifar10) == 10000 + + # _SampledDataset + s_mnist = mnist.take(100).__mx_handle__() + assert len(s_mnist) == 100 + assert np.all(s_mnist[0][0].asnumpy() == mnist[0][0].asnumpy()) + assert s_mnist[0][1] == mnist[0][1] + + # ArrayDataset + mc = mx.gluon.data.ArrayDataset(mnist.take(100), cifar10.take(100)).__mx_handle__() + assert len(mc) == 100 + assert len(mc[0]) == 4 # two from mnist, two from cifar10 + assert mc[0][1] == mnist[0][1] + assert mc[0][3] == cifar10[0][1] + +def test_mx_data_loader(): + from mxnet.gluon.data.dataloader import DataLoader + + dataset = mx.gluon.data.vision.MNIST(train=False) + dl = DataLoader(num_workers=0, dataset=dataset, batch_size=32) + for _ in dl: + pass + +def test_mx_data_loader_nopython(): + from mxnet.gluon.data.dataloader import DataLoader + from mxnet.gluon.data.vision.transforms import ToTensor + dataset = mx.gluon.data.vision.MNIST(train=False) + dl1 = DataLoader(dataset=dataset.transform_first(ToTensor()), batch_size=32, try_nopython=True, shuffle=False) + dl2 = DataLoader(dataset=dataset.transform_first(ToTensor()), batch_size=32, try_nopython=False, shuffle=False) + assert len(dl1) == len(dl2) + assert np.all(next(iter(dl1))[1].asnumpy() == next(iter(dl2))[1].asnumpy()) + for _ in dl1: + pass + +def test_batchify_stack(): + a = np.array([[1, 2, 3, 4], [5, 6, 7, 8]]) + b = np.array([[5, 6, 7, 8], [1, 2, 3, 4]]) + bf = mx.gluon.data.batchify.Stack() + bf_handle = bf.__mx_handle__() + c = bf([a, b]) + d = bf_handle([a, b]) + assert c.shape == d.shape + assert mx.test_utils.almost_equal(c.asnumpy(), d.asnumpy()) + assert mx.test_utils.almost_equal(c.asnumpy(), np.stack((a, b))) + +def test_batchify_pad(): + a = np.array([[1, 2, 3, 4], [11, 12, 13, 14]]) + b = np.array([[4, 5, 6]]) + c = np.array([[9, 10]]) + bf = mx.gluon.data.batchify.Pad(val=-1) + bf_handle = bf.__mx_handle__() + d = bf([a, b, c]) + e = bf_handle([a, b, c]) + assert d.shape == e.shape + assert mx.test_utils.almost_equal(d.asnumpy(), e.asnumpy()) + expected = np.array([[[ 1., 2., 3., 4.], [11., 12., 13., 14.]], + [[ 4., 5., 6., -1.], [-1., -1., -1., -1.]], + [[ 9., 10., -1., -1.], [-1., -1., -1., -1.]]]) + assert mx.test_utils.almost_equal(d.asnumpy(), expected) + +def test_batchify_group(): + a = [np.array([[1, 2, 3, 4], [5, 6, 7, 8]]), np.array([[1, 2, 3, 4], [11, 12, 13, 14]])] + b = [np.array([[1, 2, 3, 4], [5, 6, 7, 8]]), np.array([[4, 5, 6]])] + c = [np.array([[1, 2, 3, 4], [5, 6, 7, 8]]), np.array([[9, 10]])] + bf = mx.gluon.data.batchify.Group(mx.gluon.data.batchify.Stack(), mx.gluon.data.batchify.Pad(val=-1)) + bf_handle = bf.__mx_handle__() + d = bf([a, b, c]) + e = bf_handle([a, b, c]) + assert d[0].shape == e[0].shape + assert d[1].shape == e[1].shape + print(d[0].asnumpy(), ',', e[0].asnumpy(), ',', e[1].asnumpy()) + assert mx.test_utils.almost_equal(d[0].asnumpy(), e[0].asnumpy()) + assert mx.test_utils.almost_equal(d[1].asnumpy(), e[1].asnumpy()) + assert 
mx.test_utils.almost_equal(d[0].asnumpy(), np.stack((a[0], b[0], c[0]))) + expected = np.array([[[ 1., 2., 3., 4.], [11., 12., 13., 14.]], + [[ 4., 5., 6., -1.], [-1., -1., -1., -1.]], + [[ 9., 10., -1., -1.], [-1., -1., -1., -1.]]]) + assert mx.test_utils.almost_equal(d[1].asnumpy(), expected) diff --git a/tests/python/unittest/test_gluon_data_vision.py b/tests/python/unittest/test_gluon_data_vision.py index d810f32b0afa..8ffb0d15952a 100644 --- a/tests/python/unittest/test_gluon_data_vision.py +++ b/tests/python/unittest/test_gluon_data_vision.py @@ -215,6 +215,7 @@ def test_transformer(): transforms.Resize(300), transforms.Resize(300, keep_ratio=True), transforms.CenterCrop(256), + transforms.RandomCrop(256, pad=16), transforms.RandomResizedCrop(224), transforms.RandomFlipLeftRight(), transforms.RandomColorJitter(0.1, 0.1, 0.1, 0.1), @@ -229,6 +230,93 @@ def test_transformer(): transform(mx.nd.ones((245, 480, 3), dtype='uint8')).wait_to_read() +@with_seed() +def test_random_crop(): + x = mx.nd.ones((245, 480, 3), dtype='uint8') + y = mx.nd.image.random_crop(x, width=100, height=100) + assert y.shape == (100, 100, 3) + +@with_seed() +def test_random_resize_crop(): + x = mx.nd.ones((245, 480, 3), dtype='uint8') + y = mx.nd.image.random_resized_crop(x, width=100, height=100) + assert y.shape == (100, 100, 3) + +@with_seed() +def test_hybrid_transformer(): + from mxnet.gluon.data.vision import transforms + + transform = transforms.HybridCompose([ + transforms.Resize(300), + transforms.Resize(300, keep_ratio=True), + transforms.CenterCrop(256), + transforms.RandomCrop(256, pad=16), + transforms.RandomFlipLeftRight(), + transforms.RandomColorJitter(0.1, 0.1, 0.1, 0.1), + transforms.RandomBrightness(0.1), + transforms.RandomContrast(0.1), + transforms.RandomSaturation(0.1), + transforms.RandomHue(0.1), + transforms.RandomLighting(0.1), + transforms.ToTensor(), + transforms.Normalize([0, 0, 0], [1, 1, 1])]) + + transform(mx.nd.ones((245, 480, 3), dtype='uint8')).wait_to_read() + +@with_seed() +def test_rotate(): + transformer = transforms.Rotate(10.) 
+ assertRaises(TypeError, transformer, mx.nd.ones((3, 30, 60), dtype='uint8')) + single_image = mx.nd.ones((3, 30, 60), dtype='float32') + single_output = transformer(single_image) + assert same(single_output.shape, (3, 30, 60)) + batch_image = mx.nd.ones((3, 3, 30, 60), dtype='float32') + batch_output = transformer(batch_image) + assert same(batch_output.shape, (3, 3, 30, 60)) + + input_image = nd.array([[[0., 0., 0.], + [0., 0., 1.], + [0., 0., 0.]]]) + rotation_angles_expected_outs = [ + (90., nd.array([[[0., 1., 0.], + [0., 0., 0.], + [0., 0., 0.]]])), + (180., nd.array([[[0., 0., 0.], + [1., 0., 0.], + [0., 0., 0.]]])), + (270., nd.array([[[0., 0., 0.], + [0., 0., 0.], + [0., 1., 0.]]])), + (360., nd.array([[[0., 0., 0.], + [0., 0., 1.], + [0., 0., 0.]]])), + ] + for rot_angle, expected_result in rotation_angles_expected_outs: + transformer = transforms.Rotate(rot_angle) + ans = transformer(input_image) + print(ans, expected_result) + assert_almost_equal(ans, expected_result, atol=1e-6) + + +@with_seed() +def test_random_rotation(): + # test exceptions for probability input outside of [0,1] + assertRaises(ValueError, transforms.RandomRotation, [-10, 10.], rotate_with_proba=1.1) + assertRaises(ValueError, transforms.RandomRotation, [-10, 10.], rotate_with_proba=-0.3) + # test `forward` + transformer = transforms.RandomRotation([-10, 10.]) + assertRaises(TypeError, transformer, mx.nd.ones((3, 30, 60), dtype='uint8')) + single_image = mx.nd.ones((3, 30, 60), dtype='float32') + single_output = transformer(single_image) + assert same(single_output.shape, (3, 30, 60)) + batch_image = mx.nd.ones((3, 3, 30, 60), dtype='float32') + batch_output = transformer(batch_image) + assert same(batch_output.shape, (3, 3, 30, 60)) + # test identity (rotate_with_proba = 0) + transformer = transforms.RandomRotation([-100., 100.], rotate_with_proba=0.0) + data = mx.nd.random_normal(shape=(3, 30, 60)) + assert_almost_equal(data, transformer(data)) + @with_seed() def test_rotate(): @@ -301,4 +389,57 @@ def test_random_transforms(): num_apply += 1 assert_almost_equal(num_apply/float(iteration), 0.5, 0.1) +@with_seed() +def test_random_gray(): + from mxnet.gluon.data.vision import transforms + + transform = transforms.RandomGray(0.5) + img = mx.nd.ones((4, 4, 3), dtype='uint8') + pixel = img[0, 0, 0].asnumpy() + iteration = 1000 + num_apply = 0 + for _ in range(iteration): + out = transform(img) + if out[0][0][0].asnumpy() != pixel: + num_apply += 1 + assert_almost_equal(num_apply/float(iteration), 0.5, 0.1) + + transform = transforms.RandomGray(0.5) + transform.hybridize() + img = mx.nd.ones((4, 4, 3), dtype='uint8') + pixel = img[0, 0, 0].asnumpy() + iteration = 1000 + num_apply = 0 + for _ in range(iteration): + out = transform(img) + if out[0][0][0].asnumpy() != pixel: + num_apply += 1 + assert_almost_equal(num_apply/float(iteration), 0.5, 0.1) + +@with_seed() +def test_bbox_random_flip(): + from mxnet.gluon.contrib.data.vision.transforms.bbox import ImageBboxRandomFlipLeftRight + + transform = ImageBboxRandomFlipLeftRight(0.5) + iteration = 200 + num_apply = 0 + for _ in range(iteration): + img = mx.nd.ones((10, 10, 3), dtype='uint8') + img[0, 0, 0] = 10 + bbox = mx.nd.array([[1, 2, 3, 4, 0]]) + im_out, im_bbox = transform(img, bbox) + if im_bbox[0][0].asnumpy() != 1 and im_out[0, 0, 0].asnumpy() != 10: + num_apply += 1 + assert_almost_equal(np.array([num_apply])/float(iteration), 0.5, 0.5) + +@with_seed() +def test_bbox_crop(): + from mxnet.gluon.contrib.data.vision.transforms.bbox import ImageBboxCrop + 
+ transform = ImageBboxCrop((0, 0, 3, 3)) + img = mx.nd.ones((10, 10, 3), dtype='uint8') + bbox = mx.nd.array([[0, 1, 3, 4, 0]]) + im_out, im_bbox = transform(img, bbox) + assert im_out.shape == (3, 3, 3) + assert im_bbox[0][2] == 3 diff --git a/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py b/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py new file mode 100644 index 000000000000..844138fefde8 --- /dev/null +++ b/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py @@ -0,0 +1,152 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import mxnet as mx +import numpy as np +import scipy.ndimage +from mxnet.test_utils import * +from common import assertRaises, with_seed +import shutil +import tempfile +import unittest + +from nose.tools import raises + +def _get_data(url, dirname): + import os, tarfile + download(url, dirname=dirname, overwrite=False) + fname = os.path.join(dirname, url.split('/')[-1]) + tar = tarfile.open(fname) + source_images = [os.path.join(dirname, x.name) for x in tar.getmembers() if x.isfile()] + if len(source_images) < 1 or not os.path.isfile(source_images[0]): + # extract only if the images are not already on disk + tar.extractall(path=dirname) + tar.close() + return source_images + +def _generate_objects(): + num = np.random.randint(1, 10) + xy = np.random.rand(num, 2) + wh = np.random.rand(num, 2) / 2 + left = (xy[:, 0] - wh[:, 0])[:, np.newaxis] + right = (xy[:, 0] + wh[:, 0])[:, np.newaxis] + top = (xy[:, 1] - wh[:, 1])[:, np.newaxis] + bot = (xy[:, 1] + wh[:, 1])[:, np.newaxis] + boxes = np.maximum(0., np.minimum(1., np.hstack((left, top, right, bot)))) + cid = np.random.randint(0, 20, size=num) + label = np.hstack((cid[:, np.newaxis], boxes)).ravel().tolist() + return [2, 5] + label + + +class TestImage(unittest.TestCase): + IMAGES_URL = "http://data.mxnet.io/data/test_images.tar.gz" + IMAGES = [] + IMAGES_DIR = None + + @classmethod + def setupClass(cls): + cls.IMAGES_DIR = tempfile.mkdtemp() + cls.IMAGES = _get_data(cls.IMAGES_URL, cls.IMAGES_DIR) + print("Loaded {} images".format(len(cls.IMAGES))) + + @classmethod + def teardownClass(cls): + if cls.IMAGES_DIR: + print("cleanup {}".format(cls.IMAGES_DIR)) + shutil.rmtree(cls.IMAGES_DIR) + + @use_np + def test_imageiter(self): + im_list = [[np.random.randint(0, 5), x] for x in TestImage.IMAGES] + os.makedirs('./data', exist_ok=True) + fname = './data/test_imageiter.lst' + file_list = ['\t'.join([str(k), str(np.random.randint(0, 5)), x]) + for k, x in enumerate(TestImage.IMAGES)] + with open(fname, 'w') as f: + for line in file_list: + f.write(line + '\n') + + test_list = ['imglist', 'path_imglist'] + for dtype in ['int32', 'float32', 'int64', 'float64']: + for test in test_list: + imglist = im_list if test == 'imglist' else None + path_imglist = fname if test == 'path_imglist' else None + imageiter_list
= [ + mx.gluon.contrib.data.vision.ImageDataLoader(2, (3, 224, 224), imglist=imglist, + path_imglist=path_imglist, path_root='', dtype=dtype), + mx.gluon.contrib.data.vision.ImageDataLoader(3, (3, 224, 224), imglist=imglist, + path_imglist=path_imglist, path_root='', dtype=dtype, last_batch='discard'), + mx.gluon.contrib.data.vision.ImageDataLoader(3, (3, 224, 224), imglist=imglist, + path_imglist=path_imglist, path_root='', dtype=dtype, last_batch='keep'), + mx.gluon.contrib.data.vision.ImageDataLoader(3, (3, 224, 224), imglist=imglist, + path_imglist=path_imglist, path_root='', dtype=dtype, last_batch='rollover'), + mx.gluon.contrib.data.vision.ImageDataLoader(3, (3, 224, 224), imglist=imglist, shuffle=True, + path_imglist=path_imglist, path_root='', dtype=dtype, last_batch='keep', + rand_crop=1, rand_gray=0.1, rand_mirror=True) + ] + for it in imageiter_list: + for batch in it: + pass + + @use_np + def test_image_bbox_iter(self): + im_list = [_generate_objects() + [x] for x in TestImage.IMAGES] + det_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 300, 300), imglist=im_list, path_root='') + for _ in range(3): + for _ in det_iter: + pass + val_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 300, 300), imglist=im_list, path_root='') + + # test batch_size is not divisible by number of images + det_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(4, (3, 300, 300), imglist=im_list, path_root='') + for _ in det_iter: + pass + + # test file list with last batch handle + os.makedirs('./data', exist_ok=True) + fname = './data/test_imagedetiter.lst' + im_list = [[k] + _generate_objects() + [x] for k, x in enumerate(TestImage.IMAGES)] + with open(fname, 'w') as f: + for line in im_list: + line = '\t'.join([str(k) for k in line]) + f.write(line + '\n') + + imageiter_list = [ + mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 400, 400), + path_imglist=fname, path_root=''), + mx.gluon.contrib.data.vision.ImageBboxDataLoader(3, (3, 400, 400), + path_imglist=fname, path_root='', last_batch='discard'), + mx.gluon.contrib.data.vision.ImageBboxDataLoader(3, (3, 400, 400), + path_imglist=fname, path_root='', last_batch='keep'), + mx.gluon.contrib.data.vision.ImageBboxDataLoader(3, (3, 400, 400), + path_imglist=fname, path_root='', last_batch='rollover'), + mx.gluon.contrib.data.vision.ImageBboxDataLoader(3, (3, 400, 400), shuffle=True, + path_imglist=fname, path_root='', last_batch='keep') + ] + + @use_np + def test_bbox_augmenters(self): + # only test if all augmenters will work + # TODO(Joshua Zhang): verify the augmenter outputs + im_list = [_generate_objects() + [x] for x in TestImage.IMAGES] + det_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 300, 300), imglist=im_list, path_root='', + rand_crop=1, rand_pad=1, rand_gray=0.1, rand_mirror=True, mean=True, + std=[1.1, 1.03, 1.05], brightness=0.1, contrast=0.1, saturation=0.1, + pca_noise=0.1, hue=0.1, inter_method=10, + max_aspect_ratio=5, area_range=(0.1, 4.0), + max_attempts=50) + for batch in det_iter: + assert np.dtype(batch[1].dtype) == np.float32, str(np.dtype(batch[1].dtype)) + ': ' + str(batch[1]) + pass diff --git a/tests/python/unittest/test_numpy_gluon_data_vision.py b/tests/python/unittest/test_numpy_gluon_data_vision.py new file mode 100644 index 000000000000..3b76cf4e78ff --- /dev/null +++ b/tests/python/unittest/test_numpy_gluon_data_vision.py @@ -0,0 +1,412 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements.
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: skip-file +from __future__ import absolute_import +from __future__ import division + +import os +from collections import namedtuple +from uuid import uuid4 +import numpy as _np +import mxnet as mx +from mxnet import gluon, autograd, np, npx +from mxnet.test_utils import use_np, assert_almost_equal, check_gluon_hybridize_consistency, same, check_symbolic_backward +from common import with_seed, assertRaises, setup_module, with_seed, teardown +import random +from mxnet.base import MXNetError +from mxnet.gluon.data.vision import transforms +from mxnet import image + +@with_seed() +@use_np +def test_to_tensor(): + # 3D Input + data_in = np.random.uniform(0, 255, (300, 300, 3)).astype(dtype=np.uint8) + out_nd = transforms.ToTensor()(np.array(data_in, dtype='uint8')) + assert_almost_equal(out_nd.asnumpy(), np.transpose( + data_in.astype(dtype=np.float32) / 255.0, (2, 0, 1))) + + # 4D Input + data_in = np.random.uniform(0, 255, (5, 300, 300, 3)).astype(dtype=np.uint8) + out_nd = transforms.ToTensor()(np.array(data_in, dtype='uint8')) + assert_almost_equal(out_nd.asnumpy(), np.transpose( + data_in.astype(dtype=np.float32) / 255.0, (0, 3, 1, 2))) + + # Invalid Input + invalid_data_in = np.random.uniform(0, 255, (5, 5, 300, 300, 3)).astype(dtype=np.uint8) + transformer = transforms.ToTensor() + assertRaises(MXNetError, transformer, invalid_data_in) + + # Bounds (0->0, 255->1) + data_in = np.zeros((10, 20, 3)).astype(dtype=np.uint8) + out_nd = transforms.ToTensor()(np.array(data_in, dtype='uint8')) + assert same(out_nd.asnumpy(), np.transpose(np.zeros(data_in.shape, dtype=np.float32), (2, 0, 1))) + + data_in = np.full((10, 20, 3), 255).astype(dtype=np.uint8) + out_nd = transforms.ToTensor()(np.array(data_in, dtype='uint8')) + assert same(out_nd.asnumpy(), np.transpose(np.ones(data_in.shape, dtype=np.float32), (2, 0, 1))) + + +@with_seed() +@use_np +def test_normalize(): + # 3D Input + data_in_3d = np.random.uniform(0, 1, (3, 300, 300)) + out_nd_3d = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))(data_in_3d) + data_expected_3d = data_in_3d.asnumpy() + data_expected_3d[:][:][0] = data_expected_3d[:][:][0] / 3.0 + data_expected_3d[:][:][1] = (data_expected_3d[:][:][1] - 1.0) / 2.0 + data_expected_3d[:][:][2] = data_expected_3d[:][:][2] - 2.0 + assert_almost_equal(data_expected_3d, out_nd_3d.asnumpy()) + + # 4D Input + data_in_4d = np.random.uniform(0, 1, (2, 3, 300, 300)) + out_nd_4d = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1))(data_in_4d) + data_expected_4d = data_in_4d.asnumpy() + data_expected_4d[0][:][:][0] = data_expected_4d[0][:][:][0] / 3.0 + data_expected_4d[0][:][:][1] = (data_expected_4d[0][:][:][1] - 1.0) / 2.0 + data_expected_4d[0][:][:][2] = data_expected_4d[0][:][:][2] - 2.0 + data_expected_4d[1][:][:][0] = data_expected_4d[1][:][:][0] / 3.0 + data_expected_4d[1][:][:][1] = 
(data_expected_4d[1][:][:][1] - 1.0) / 2.0 + data_expected_4d[1][:][:][2] = data_expected_4d[1][:][:][2] - 2.0 + assert_almost_equal(data_expected_4d, out_nd_4d.asnumpy()) + + # Invalid Input - Neither 3D or 4D input + invalid_data_in = np.random.uniform(0, 1, (5, 5, 3, 300, 300)) + normalize_transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1)) + assertRaises(MXNetError, normalize_transformer, invalid_data_in) + + # Invalid Input - Channel neither 1 or 3 + invalid_data_in = np.random.uniform(0, 1, (5, 4, 300, 300)) + normalize_transformer = transforms.Normalize(mean=(0, 1, 2), std=(3, 2, 1)) + assertRaises(MXNetError, normalize_transformer, invalid_data_in) + + +@with_seed() +@use_np +def test_resize(): + def _test_resize_with_diff_type(dtype): + # test normal case + data_in = np.random.uniform(0, 255, (300, 200, 3)).astype(dtype) + out_nd = transforms.Resize(200)(data_in) + data_expected = mx.image.imresize(data_in, 200, 200, 1) + assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) + # test 4D input + data_bath_in = np.random.uniform(0, 255, (3, 300, 200, 3)).astype(dtype) + out_batch_nd = transforms.Resize(200)(data_bath_in) + for i in range(len(out_batch_nd)): + assert_almost_equal(mx.image.imresize(data_bath_in[i], 200, 200, 1).asnumpy(), + out_batch_nd[i].asnumpy()) + # test interp = 2 + out_nd = transforms.Resize(200, interpolation=2)(data_in) + data_expected = mx.image.imresize(data_in, 200, 200, 2) + assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) + # test height not equals to width + out_nd = transforms.Resize((200, 100))(data_in) + data_expected = mx.image.imresize(data_in, 200, 100, 1) + assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) + # test keep_ratio + out_nd = transforms.Resize(150, keep_ratio=True)(data_in) + data_expected = mx.image.imresize(data_in, 150, 225, 1) + assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) + # test size below zero + invalid_transform = transforms.Resize(-150, keep_ratio=True) + assertRaises(MXNetError, invalid_transform, data_in) + # test size more than 2: + invalid_transform = transforms.Resize((100, 100, 100), keep_ratio=True) + assertRaises(MXNetError, invalid_transform, data_in) + + for dtype in ['uint8', 'float32', 'float64']: + _test_resize_with_diff_type(dtype) + + +@with_seed() +@use_np +def test_crop_resize(): + def _test_crop_resize_with_diff_type(dtype): + # test normal case + data_in = np.arange(60).reshape((5, 4, 3)).astype(dtype) + out_nd = transforms.CropResize(0, 0, 3, 2)(data_in) + out_np = out_nd.asnumpy() + assert(out_np.sum() == 180) + assert((out_np[0:2,1,1].flatten() == [4, 16]).all()) + # test 4D input + data_bath_in = np.arange(180).reshape((2, 6, 5, 3)).astype(dtype) + out_batch_nd = transforms.CropResize(1, 2, 3, 4)(data_bath_in) + out_batch_np = out_batch_nd.asnumpy() + assert(out_batch_np.sum() == 7524) + assert((out_batch_np[0:2,0:4,1,1].flatten() == [37, 52, 67, 82, 127, 142, 157, 172]).all()) + # test normal case with resize + data_in = np.random.uniform(0, 255, (300, 200, 3)).astype(dtype) + out_nd = transforms.CropResize(0, 0, 100, 50, (25, 25), 1)(data_in) + data_expected = transforms.Resize(size=25, interpolation=1)(data_in[:50, :100, :3]) #nd.slice(data_in, (0, 0, 0), (50, 100, 3))) + assert_almost_equal(out_nd.asnumpy(), data_expected.asnumpy()) + # test 4D input with resize + data_bath_in = np.random.uniform(0, 255, (3, 300, 200, 3)).astype(dtype) + out_batch_nd = transforms.CropResize(0, 0, 100, 50, (25, 25), 1)(data_bath_in) + for i in 
range(len(out_batch_nd)): + actual = transforms.Resize(size=25, interpolation=1)(data_bath_in[i][:50, :100, :3]).asnumpy() #(nd.slice(data_bath_in[i], (0, 0, 0), (50, 100, 3))).asnumpy() + expected = out_batch_nd[i].asnumpy() + assert_almost_equal(expected, actual) + # test with resize height and width should be greater than 0 + transformer = transforms.CropResize(0, 0, 100, 50, (-25, 25), 1) + assertRaises(MXNetError, transformer, data_in) + # test height and width should be greater than 0 + transformer = transforms.CropResize(0, 0, -100, -50) + assertRaises(MXNetError, transformer, data_in) + # test cropped area is bigger than input data + transformer = transforms.CropResize(150, 200, 200, 500) + assertRaises(MXNetError, transformer, data_in) + assertRaises(MXNetError, transformer, data_bath_in) + + for dtype in ['uint8', 'float32', 'float64']: + _test_crop_resize_with_diff_type(dtype) + + # test npx.image.crop backward + def test_crop_backward(test_nd_arr, TestCase): + a_np = test_nd_arr.asnumpy() + b_np = a_np[(slice(TestCase.y, TestCase.y + TestCase.height), slice(TestCase.x, TestCase.x + TestCase.width), slice(0, 3))] + + data = mx.sym.Variable('data') + crop_sym = mx.sym.image.crop(data, TestCase.x, TestCase.y, TestCase.width, TestCase.height) + + expected_in_grad = np.zeros_like(np.array(a_np)) + expected_in_grad[(slice(TestCase.y, TestCase.y + TestCase.height), slice(TestCase.x, TestCase.x + TestCase.width), slice(0, 3))] = b_np + check_symbolic_backward(crop_sym, [a_np], [b_np], [expected_in_grad]) + + TestCase = namedtuple('TestCase', ['x', 'y', 'width', 'height']) + test_list = [TestCase(0, 0, 3, 3), TestCase(2, 1, 1, 2), TestCase(0, 1, 3, 2)] + + for dtype in ['uint8', 'float32', 'float64']: + data_in = np.arange(60).reshape((5, 4, 3)).astype(dtype) + for test_case in test_list: + test_crop_backward(data_in, test_case) + + +@with_seed() +@use_np +def test_flip_left_right(): + data_in = np.random.uniform(0, 255, (300, 300, 3)).astype(dtype=np.uint8) + flip_in = data_in[:, ::-1, :] + data_trans = npx.image.flip_left_right(np.array(data_in, dtype='uint8')) + assert_almost_equal(flip_in, data_trans.asnumpy()) + + +@with_seed() +@use_np +def test_flip_top_bottom(): + data_in = np.random.uniform(0, 255, (300, 300, 3)).astype(dtype=np.uint8) + flip_in = data_in[::-1, :, :] + data_trans = npx.image.flip_top_bottom(np.array(data_in, dtype='uint8')) + assert_almost_equal(flip_in, data_trans.asnumpy()) + + +@with_seed() +@use_np +def test_transformer(): + from mxnet.gluon.data.vision import transforms + + transform = transforms.Compose([ + transforms.Resize(300), + transforms.Resize(300, keep_ratio=True), + transforms.CenterCrop(256), + transforms.RandomCrop(256, pad=16), + transforms.RandomResizedCrop(224), + transforms.RandomFlipLeftRight(), + transforms.RandomColorJitter(0.1, 0.1, 0.1, 0.1), + transforms.RandomBrightness(0.1), + transforms.RandomContrast(0.1), + transforms.RandomSaturation(0.1), + transforms.RandomHue(0.1), + transforms.RandomLighting(0.1), + transforms.ToTensor(), + transforms.RandomRotation([-10., 10.]), + transforms.Normalize([0, 0, 0], [1, 1, 1])]) + + transform(mx.np.ones((245, 480, 3), dtype='uint8')).wait_to_read() + +@with_seed() +@use_np +def test_random_crop(): + x = mx.np.ones((245, 480, 3), dtype='uint8') + y = mx.npx.image.random_crop(x, width=100, height=100) + assert y.shape == (100, 100, 3) + +@with_seed() +@use_np +def test_random_resize_crop(): + x = mx.np.ones((245, 480, 3), dtype='uint8') + y = mx.npx.image.random_resized_crop(x, width=100, 
height=100) + assert y.shape == (100, 100, 3) + +@with_seed() +@use_np +def test_hybrid_transformer(): + from mxnet.gluon.data.vision import transforms + + transform = transforms.HybridCompose([ + transforms.Resize(300), + transforms.Resize(300, keep_ratio=True), + transforms.CenterCrop(256), + transforms.RandomCrop(256, pad=16), + transforms.RandomFlipLeftRight(), + transforms.RandomColorJitter(0.1, 0.1, 0.1, 0.1), + transforms.RandomBrightness(0.1), + transforms.RandomContrast(0.1), + transforms.RandomSaturation(0.1), + transforms.RandomHue(0.1), + transforms.RandomLighting(0.1), + transforms.ToTensor(), + transforms.Normalize([0, 0, 0], [1, 1, 1])]) + + transform(mx.np.ones((245, 480, 3), dtype='uint8')).wait_to_read() + +@with_seed() +@use_np +def test_rotate(): + transformer = transforms.Rotate(10.) + assertRaises(TypeError, transformer, mx.np.ones((3, 30, 60), dtype='uint8')) + single_image = mx.np.ones((3, 30, 60), dtype='float32') + single_output = transformer(single_image) + assert same(single_output.shape, (3, 30, 60)) + batch_image = mx.np.ones((3, 3, 30, 60), dtype='float32') + batch_output = transformer(batch_image) + assert same(batch_output.shape, (3, 3, 30, 60)) + + input_image = np.array([[[0., 0., 0.], + [0., 0., 1.], + [0., 0., 0.]]]) + rotation_angles_expected_outs = [ + (90., np.array([[[0., 1., 0.], + [0., 0., 0.], + [0., 0., 0.]]])), + (180., np.array([[[0., 0., 0.], + [1., 0., 0.], + [0., 0., 0.]]])), + (270., np.array([[[0., 0., 0.], + [0., 0., 0.], + [0., 1., 0.]]])), + (360., np.array([[[0., 0., 0.], + [0., 0., 1.], + [0., 0., 0.]]])), + ] + for rot_angle, expected_result in rotation_angles_expected_outs: + transformer = transforms.Rotate(rot_angle) + ans = transformer(input_image) + print(type(ans), ans, type(expected_result), expected_result) + assert_almost_equal(ans.asnumpy(), expected_result.asnumpy(), atol=1e-6) + + +@with_seed() +@use_np +def test_random_rotation(): + # test exceptions for probability input outside of [0,1] + assertRaises(ValueError, transforms.RandomRotation, [-10, 10.], rotate_with_proba=1.1) + assertRaises(ValueError, transforms.RandomRotation, [-10, 10.], rotate_with_proba=-0.3) + # test `forward` + transformer = transforms.RandomRotation([-10, 10.]) + assertRaises(TypeError, transformer, mx.np.ones((3, 30, 60), dtype='uint8')) + single_image = mx.np.ones((3, 30, 60), dtype='float32') + single_output = transformer(single_image) + assert same(single_output.shape, (3, 30, 60)) + batch_image = mx.np.ones((3, 3, 30, 60), dtype='float32') + batch_output = transformer(batch_image) + assert same(batch_output.shape, (3, 3, 30, 60)) + # test identity (rotate_with_proba = 0) + transformer = transforms.RandomRotation([-100., 100.], rotate_with_proba=0.0) + data = mx.np.random.normal(size=(3, 30, 60)) + assert_almost_equal(data.asnumpy(), transformer(data).asnumpy()) + + +@with_seed() +@use_np +def test_random_transforms(): + from mxnet.gluon.data.vision import transforms + + tmp_t = transforms.Compose([transforms.Resize(300), transforms.RandomResizedCrop(224)]) + transform = transforms.Compose([transforms.RandomApply(tmp_t, 0.5)]) + + img = mx.np.ones((10, 10, 3), dtype='uint8') + iteration = 1000 + num_apply = 0 + for _ in range(iteration): + out = transform(img) + if out.shape[0] == 224: + num_apply += 1 + assert_almost_equal(num_apply/float(iteration), 0.5, 0.1) + +@with_seed() +@use_np +def test_random_gray(): + from mxnet.gluon.data.vision import transforms + + transform = transforms.RandomGray(0.5) + img = mx.np.ones((4, 4, 3), 
dtype='uint8') + pixel = img[0, 0, 0].asnumpy() + iteration = 1000 + num_apply = 0 + for _ in range(iteration): + out = transform(img) + if out[0][0][0].asnumpy() != pixel: + num_apply += 1 + assert_almost_equal(num_apply/float(iteration), 0.5, 0.1) + + transform = transforms.RandomGray(0.5) + transform.hybridize() + img = mx.np.ones((4, 4, 3), dtype='uint8') + pixel = img[0, 0, 0].asnumpy() + iteration = 1000 + num_apply = 0 + for _ in range(iteration): + out = transform(img) + if out[0][0][0].asnumpy() != pixel: + num_apply += 1 + assert_almost_equal(num_apply/float(iteration), 0.5, 0.1) + +@with_seed() +@use_np +def test_bbox_random_flip(): + from mxnet.gluon.contrib.data.vision.transforms.bbox import ImageBboxRandomFlipLeftRight + + transform = ImageBboxRandomFlipLeftRight(0.5) + iteration = 200 + num_apply = 0 + for _ in range(iteration): + img = mx.np.ones((10, 10, 3), dtype='uint8') + img[0, 0, 0] = 10 + bbox = mx.np.array([[1, 2, 3, 4, 0]]) + im_out, im_bbox = transform(img, bbox) + if im_bbox[0][0].asnumpy() != 1 and im_out[0, 0, 0].asnumpy() != 10: + num_apply += 1 + assert_almost_equal(np.array([num_apply])/float(iteration), 0.5, 0.5) + +@with_seed() +@use_np +def test_bbox_crop(): + from mxnet.gluon.contrib.data.vision.transforms.bbox import ImageBboxCrop + + transform = ImageBboxCrop((0, 0, 3, 3)) + img = mx.np.ones((10, 10, 3), dtype='uint8') + bbox = mx.np.array([[0, 1, 3, 4, 0]]) + im_out, im_bbox = transform(img, bbox) + assert im_out.shape == (3, 3, 3) + assert im_bbox[0][2] == 3 + +if __name__ == '__main__': + import nose + nose.runmodule() diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index 5401e61c8f86..c1930e0dbf0f 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -7463,6 +7463,56 @@ def hybrid_forward(self, F, a, *args, **kwargs): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) +@with_seed() +@use_np +def test_np_dsplit(): + class TestDSplit(HybridBlock): + def __init__(self, indices_or_sections): + super(TestDSplit, self).__init__() + self._indices_or_sections = indices_or_sections + + def hybrid_forward(self, F, a, *args, **kwargs): + return F.np.dsplit(a, indices_or_sections=self._indices_or_sections) + + shapes = [ + (2, 4, 6), + (3, 0, 6), + (2, 3, 0, 4), + ] + indices_or_sections_num = [ + (2, 4), + (3, 3), + (3,), + (1,), + 2, + ] + for hybridize in [True, False]: + for shape in shapes: + for indices_or_sections in indices_or_sections_num: + # test gluon + test_dsplit = TestDSplit(indices_or_sections=indices_or_sections) + if hybridize: + test_dsplit.hybridize() + + a = mx.nd.random.uniform(-1.0, 1.0, shape=shape).as_np_ndarray() + a.attach_grad() + expected_ret = _np.dsplit(a.asnumpy(), indices_or_sections=indices_or_sections) + with mx.autograd.record(): + y = test_dsplit(a) + assert len(y) == len(expected_ret) + for mx_out, np_out in zip(y, expected_ret): + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + mx.autograd.backward(y) + assert_almost_equal(a.grad.asnumpy(), _np.ones(a.shape), rtol=1e-3, atol=1e-5) + + # test imperative + mx_outs = np.dsplit(a, indices_or_sections=indices_or_sections) + np_outs = _np.dsplit(a.asnumpy(), indices_or_sections=indices_or_sections) + for mx_out, np_out in zip(mx_outs, np_outs): + assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) + + @with_seed() @use_np def test_np_einsum(): diff --git a/tests/python/unittest/test_optimizer.py
b/tests/python/unittest/test_optimizer.py index 2f7f4dee8481..6223c1e39f87 100644 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -956,4 +956,3 @@ def test_cosine_scheduler(): np.testing.assert_almost_equal(cosine_sched(0), base_lr) np.testing.assert_almost_equal(cosine_sched(steps), final_lr) assert (cosine_sched(500) > 1.5) - diff --git a/tools/pip/doc/CU90_ADDITIONAL.md b/tools/pip/doc/CU90_ADDITIONAL.md new file mode 100644 index 000000000000..3f51c50520f2 --- /dev/null +++ b/tools/pip/doc/CU90_ADDITIONAL.md @@ -0,0 +1,47 @@ + + + + + + + + + + + + + + + + + +**CUDA 9.0 package for MXNet is no longer maintained for new releases.** + +Prerequisites +------------- +This package supports Linux and Windows platforms. You may also want to check: +- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu102/) with CUDA-10.2 support. +- [mxnet-cu101](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.1 support. +- [mxnet-cu100](https://pypi.python.org/pypi/mxnet-cu100/) with CUDA-10.0 support. +- [mxnet-cu92](https://pypi.python.org/pypi/mxnet-cu92/) with CUDA-9.2 support. +- [mxnet-cu90](https://pypi.python.org/pypi/mxnet-cu90/) with CUDA-9.0 support. +- [mxnet](https://pypi.python.org/pypi/mxnet/). +- [mxnet-native](https://pypi.python.org/pypi/mxnet-native/) CPU variant without MKLDNN. + +To download CUDA, check [CUDA download](https://developer.nvidia.com/cuda-downloads). For more instructions, check [CUDA Toolkit online documentation](http://docs.nvidia.com/cuda/index.html). + +To install for other platforms (e.g. Windows, Raspberry Pi/ARM) or other versions, check [Installing MXNet](https://mxnet.apache.org/versions/master/install/index.html) for instructions on building from source. + +Installation +------------ +To install: +```bash +pip install mxnet-cu90 +``` + +Nightly Builds +-------------- +To install the latest nightly build, use: +```bash +pip install --pre mxnet-cu90 -f https://dist.mxnet.io/python +``` diff --git a/tools/staticbuild/README.md b/tools/staticbuild/README.md index 077150e11762..b06f3b9a5a05 100644 --- a/tools/staticbuild/README.md +++ b/tools/staticbuild/README.md @@ -27,7 +27,7 @@ environment variable settings. Here are examples you can run with this script: ``` tools/staticbuild/build.sh cu102 ``` -This would build the mxnet package based on CUDA 10.2. Currently, we support variants cpu, native, cu92, cu100, cu101 and cu102. All of these variants expect native have MKL-DNN backend enabled. +This would build the mxnet package based on CUDA 10.2. Currently, we support variants cpu, native, cu92, cu100, cu101 and cu102. All of these variants except native have MKL-DNN backend enabled. ``` tools/staticbuild/build.sh cpu From 6f5f5270166d89c8d3e40279e6d9038e7083a499 Mon Sep 17 00:00:00 2001 From: "Joshua Z.
Zhang" Date: Fri, 17 Apr 2020 13:40:06 -0700 Subject: [PATCH 02/17] update --- cd/README.md | 4 +- cd/utils/artifact_repository.md | 7 +- ci/docker/Dockerfile.build.test.armv7 | 2 +- ci/docker/Dockerfile.build.test.armv8 | 7 +- ci/docker/Dockerfile.build.ubuntu_rat | 4 +- ci/docker/Dockerfile.publish.centos7_gpu_cu90 | 43 ----- .../Dockerfile.publish.test.ubuntu1604_cpu | 18 +- .../Dockerfile.publish.test.ubuntu1604_gpu | 24 ++- .../Dockerfile.publish.test.ubuntu1804_cpu | 11 +- ci/docker/install/ubuntu_julia.sh | 3 + ci/docker/install/ubuntu_scala.sh | 5 +- ci/docker/install/ubuntu_tutorials.sh | 9 +- config/distribution/linux_cu90.cmake | 36 ---- config/distribution/linux_cu91.cmake | 36 ---- .../clojure-package/examples/rnn/get_data.sh | 0 make/staticbuild/linux_cu90.mk | 180 ------------------ make/staticbuild/linux_cu91.mk | 180 ------------------ python/mxnet/ndarray/numpy/_op.py | 93 --------- python/mxnet/numpy/multiarray.py | 174 ----------------- .../examples/scripts/module/mnist_mlp.sh | 7 +- .../unittest/test_numpy_interoperability.py | 8 +- tests/python/unittest/test_numpy_op.py | 72 ++----- tools/pip/doc/CU90_ADDITIONAL.md | 47 ----- tools/staticbuild/README.md | 2 +- 24 files changed, 63 insertions(+), 909 deletions(-) delete mode 100644 ci/docker/Dockerfile.publish.centos7_gpu_cu90 mode change 100644 => 100755 ci/docker/install/ubuntu_tutorials.sh delete mode 100644 config/distribution/linux_cu90.cmake delete mode 100644 config/distribution/linux_cu91.cmake mode change 100644 => 100755 contrib/clojure-package/examples/rnn/get_data.sh delete mode 100644 make/staticbuild/linux_cu90.mk delete mode 100644 make/staticbuild/linux_cu91.mk delete mode 100644 tools/pip/doc/CU90_ADDITIONAL.md diff --git a/cd/README.md b/cd/README.md index 8247af964906..30cd44bd1d14 100644 --- a/cd/README.md +++ b/cd/README.md @@ -25,7 +25,7 @@ MXNet aims to support a variety of frontends, e.g. Python, Java, Perl, R, etc. a The CD process is driven by the [CD pipeline job](Jenkinsfile_cd_pipeline), which orchestrates the order in which the artifacts are delivered. For instance, first publish the libmxnet library before publishing the pip package. It does this by triggering the [release job](Jenkinsfile_release_job) with a specific set of parameters for each delivery channel. The release job executes the specific release pipeline for a delivery channel across all MXNet *variants*. -A variant is a specific environment or features for which MXNet is compiled. For instance CPU, GPU with CUDA v10.0, CUDA v9.0 with MKL-DNN support, etc. +A variant is a specific environment or features for which MXNet is compiled. For instance CPU, GPU with CUDA v10.0, CUDA v9.0 with MKL-DNN support, etc. Currently, below variants are supported. All of these variants except native have MKL-DNN backend enabled. @@ -120,7 +120,7 @@ The "first mile" of the CD process is posting the mxnet binaries to the [artifac ##### Timeout -We shouldn't set global timeouts for the pipelines. Rather, the `step` being executed should be rapped with a `timeout` function (as in the pipeline example above). The `max_time` is a global variable set at the [release job](Jenkinsfile_release_job) level. +We shouldn't set global timeouts for the pipelines. Rather, the `step` being executed should be rapped with a `timeout` function (as in the pipeline example above). The `max_time` is a global variable set at the [release job](Jenkinsfile_release_job) level. 
##### Node of execution diff --git a/cd/utils/artifact_repository.md b/cd/utils/artifact_repository.md index 80297efca2c8..5ee736f2d26e 100644 --- a/cd/utils/artifact_repository.md +++ b/cd/utils/artifact_repository.md @@ -33,7 +33,7 @@ An mxnet compiled library, or artifact for our purposes, is identified by the fo **Commit Id** -Manually configured through the --git-sha argument. +Manually configured through the --git-sha argument. If not set, derived by: @@ -59,7 +59,7 @@ As long as the tool is being run from the MXNet code base, the runtime feature d If it has been compiled with CUDA support, the output of /usr/local/cuda/bin/nvcc --version can be mined for the exact CUDA version (eg. 8.0, 9.0, etc.). -By knowing which features are enabled on the binary, and if necessary, which CUDA version is installed on the machine, the value for the variant argument can be calculated. Eg. if CUDA features are enabled, and nvcc reports cuda version 10, then the variant would be cu100. If neither MKL-DNN nor CUDA features are enabled, the variant would be native. +By knowing which features are enabled on the binary, and if necessary, which CUDA version is installed on the machine, the value for the variant argument can be calculated. Eg. if CUDA features are enabled, and nvcc reports cuda version 10, then the variant would be cu100. If neither MKL-DNN nor CUDA features are enabled, the variant would be native. **Dependency Linking** @@ -68,7 +68,7 @@ The library dependencies can be either statically or dynamically linked. This pr ### Uploading an Artifact The user must specify the path to the libmxnet.so, any license files, and any dependencies. The latter two are optional. - + Example: `./artifact_repository.py --push --static --libmxnet /path/to/libmxnet.so --licenses path/to/license1.txt /path/to/other_licenses/*.txt --dependencies /path/to/dependencies/*.so` @@ -102,3 +102,4 @@ dist ``` The libmxnet.meta file will include the characteristics of the artifact (ie. library type, variant, git commit id, etc.) in a “property” file format. 
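As a rough illustration of the variant derivation described above, here is a minimal sketch; the function name and feature flags are hypothetical stand-ins, not the actual `artifact_repository.py` API:

```python
# Hypothetical sketch of the variant derivation described above.
# `cuda_enabled`, `mkldnn_enabled` and `cuda_version` stand in for whatever
# feature detection (and nvcc version mining) the real tool performs.
def derive_variant(cuda_enabled, mkldnn_enabled, cuda_version=None):
    if cuda_enabled:
        # e.g. nvcc reporting 10.0 yields "cu100", 9.2 yields "cu92"
        major, minor = cuda_version.split('.')[:2]
        return 'cu{}{}'.format(major, minor)
    if mkldnn_enabled:
        # MKL-DNN enabled without CUDA is the plain CPU variant
        return 'cpu'
    # neither MKL-DNN nor CUDA features enabled
    return 'native'

assert derive_variant(True, True, '10.0') == 'cu100'
assert derive_variant(False, False) == 'native'
```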
+ diff --git a/ci/docker/Dockerfile.build.test.armv7 b/ci/docker/Dockerfile.build.test.armv7 index 066040c5be8f..711bf651491f 100644 --- a/ci/docker/Dockerfile.build.test.armv7 +++ b/ci/docker/Dockerfile.build.test.armv7 @@ -47,4 +47,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ -WORKDIR /work/mxnet +WORKDIR /work/mxnet \ No newline at end of file diff --git a/ci/docker/Dockerfile.build.test.armv8 b/ci/docker/Dockerfile.build.test.armv8 index c10d23ad8efe..920211440ef6 100644 --- a/ci/docker/Dockerfile.build.test.armv8 +++ b/ci/docker/Dockerfile.build.test.armv8 @@ -22,11 +22,6 @@ FROM arm64v8/ubuntu:20.04 WORKDIR /usr/local - -WORKDIR /usr/local - -COPY install/ubuntu_rat.sh /work/ -RUN /work/ubuntu_rat.sh RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ python3 \ python3-pip \ @@ -50,4 +45,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ -WORKDIR /work/mxnet +WORKDIR /work/mxnet \ No newline at end of file diff --git a/ci/docker/Dockerfile.build.ubuntu_rat b/ci/docker/Dockerfile.build.ubuntu_rat index 7536057d73ff..234d2e42e946 100644 --- a/ci/docker/Dockerfile.build.ubuntu_rat +++ b/ci/docker/Dockerfile.build.ubuntu_rat @@ -20,7 +20,7 @@ FROM ubuntu:16.04 -WORKDIR /usr/local +WORKDIR /work/deps COPY install/ubuntu_rat.sh /work/ RUN /work/ubuntu_rat.sh @@ -31,4 +31,6 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ + WORKDIR /work/mxnet +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.publish.centos7_gpu_cu90 b/ci/docker/Dockerfile.publish.centos7_gpu_cu90 deleted file mode 100644 index 23217148f87c..000000000000 --- a/ci/docker/Dockerfile.publish.centos7_gpu_cu90 +++ /dev/null @@ -1,43 +0,0 @@ -# -*- mode: dockerfile -*- -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -FROM nvidia/cuda:9.0-cudnn7-devel-centos7 - -WORKDIR /work/deps - -COPY install/centos7_base.sh /work/ -RUN /work/centos7_base.sh -COPY install/centos7_ccache.sh /work/ -RUN /work/centos7_ccache.sh -COPY install/centos7_python.sh /work/ -RUN /work/centos7_python.sh -COPY install/centos7_scala.sh /work/ -RUN /work/centos7_scala.sh -ENV SHORT_CUDA_VERSION=9.0 -ENV SHORT_NCCL_VERSION=2.4.8 -COPY install/centos7_nccl.sh /work/ -RUN /work/centos7_nccl.sh - -ARG USER_ID=0 -COPY install/centos7_adduser.sh /work/ -RUN /work/centos7_adduser.sh - -ENV PYTHONPATH=./python/ -WORKDIR /work/mxnet - -COPY runtime_functions.sh /work/ diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1604_cpu b/ci/docker/Dockerfile.publish.test.ubuntu1604_cpu index 0f7ff1e3714b..bbb7b6a0d7bd 100644 --- a/ci/docker/Dockerfile.publish.test.ubuntu1604_cpu +++ b/ci/docker/Dockerfile.publish.test.ubuntu1604_cpu @@ -20,17 +20,13 @@ FROM ubuntu:16.04 -WORKDIR /usr/local +WORKDIR /work/deps -RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y \ - python3 \ - python3-pip \ - python3-numpy \ - python3-scipy \ - python3-nose \ - python3-nose-timer \ - python3-requests \ - && rm -rf /var/lib/apt/lists/* +COPY install/ubuntu_base.sh /work/ +RUN /work/ubuntu_base.sh + +COPY install/ubuntu_scala.sh /work/ +RUN /work/ubuntu_scala.sh ARG USER_ID=0 ARG GROUP_ID=0 @@ -38,4 +34,6 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ + WORKDIR /work/mxnet +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1604_gpu b/ci/docker/Dockerfile.publish.test.ubuntu1604_gpu index 8276536bf10a..660461dc0cfa 100644 --- a/ci/docker/Dockerfile.publish.test.ubuntu1604_gpu +++ b/ci/docker/Dockerfile.publish.test.ubuntu1604_gpu @@ -22,20 +22,18 @@ FROM nvidia/cuda:9.2-cudnn7-devel-ubuntu16.04 WORKDIR /work/deps -COPY install/centos7_base.sh /work/ -RUN /work/centos7_base.sh -COPY install/centos7_ccache.sh /work/ -RUN /work/centos7_ccache.sh -COPY install/centos7_python.sh /work/ -RUN /work/centos7_python.sh -COPY install/centos7_scala.sh /work/ -RUN /work/centos7_scala.sh +COPY install/ubuntu_base.sh /work/ +RUN /work/ubuntu_base.sh -ARG USER_ID=0 -COPY install/centos7_adduser.sh /work/ -RUN /work/centos7_adduser.sh +COPY install/ubuntu_scala.sh /work/ +RUN /work/ubuntu_scala.sh -ENV PYTHONPATH=./python/ -WORKDIR /work/mxnet +ARG USER_ID=0 +ARG GROUP_ID=0 +COPY install/ubuntu_adduser.sh /work/ +RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ + +WORKDIR /work/mxnet +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/Dockerfile.publish.test.ubuntu1804_cpu b/ci/docker/Dockerfile.publish.test.ubuntu1804_cpu index fe3a955b9a73..e3a8c193f234 100644 --- a/ci/docker/Dockerfile.publish.test.ubuntu1804_cpu +++ b/ci/docker/Dockerfile.publish.test.ubuntu1804_cpu @@ -31,10 +31,11 @@ COPY install/ubuntu_scala.sh /work/ RUN /work/ubuntu_scala.sh ARG USER_ID=0 -COPY install/centos7_adduser.sh /work/ -RUN /work/centos7_adduser.sh - -ENV PYTHONPATH=./python/ -WORKDIR /work/mxnet +ARG GROUP_ID=0 +COPY install/ubuntu_adduser.sh /work/ +RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ + +WORKDIR /work/mxnet +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:/usr/local/lib diff --git a/ci/docker/install/ubuntu_julia.sh b/ci/docker/install/ubuntu_julia.sh index 348a6d13da07..435ec46db6c7 100755 --- a/ci/docker/install/ubuntu_julia.sh +++ b/ci/docker/install/ubuntu_julia.sh @@ -17,6 +17,9 @@ # specific language 
governing permissions and limitations # under the License. +# build and install are separated so changes to build don't invalidate +# the whole docker cache for the image + set -ex function install_julia() { diff --git a/ci/docker/install/ubuntu_scala.sh b/ci/docker/install/ubuntu_scala.sh index e3afd8e728ee..355e978e075c 100755 --- a/ci/docker/install/ubuntu_scala.sh +++ b/ci/docker/install/ubuntu_scala.sh @@ -17,9 +17,8 @@ # specific language governing permissions and limitations # under the License. -# Install Thrust 1.9.8 to be shipped with Cuda 11. -# Fixes https://github.com/thrust/thrust/issues/1072 for Clang 10 -# This file can be deleted when using Cuda 11 on CI +# build and install are separated so changes to build don't invalidate +# the whole docker cache for the image set -ex diff --git a/ci/docker/install/ubuntu_tutorials.sh b/ci/docker/install/ubuntu_tutorials.sh old mode 100644 new mode 100755 index 0b2204f1f9a5..469df6190ea4 --- a/ci/docker/install/ubuntu_tutorials.sh +++ b/ci/docker/install/ubuntu_tutorials.sh @@ -1,3 +1,5 @@ +#!/usr/bin/env bash + # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -15,11 +17,8 @@ # specific language governing permissions and limitations # under the License. -set(CMAKE_SYSTEM_NAME Linux) -set(CMAKE_SYSTEM_PROCESSOR "armv7l") -set(CMAKE_C_COMPILER arm-linux-gnueabihf-gcc) -set(CMAKE_CXX_COMPILER arm-linux-gnueabihf-g++) -set(CMAKE_FIND_ROOT_PATH "/usr/arm-linux-gnueabihf" "/usr/local/arm-linux-gnueabihf") +# build and install are separated so changes to build don't invalidate +# the whole docker cache for the image set -ex apt-get update || true diff --git a/config/distribution/linux_cu90.cmake b/config/distribution/linux_cu90.cmake deleted file mode 100644 index e4249cd609c8..000000000000 --- a/config/distribution/linux_cu90.cmake +++ /dev/null @@ -1,36 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set(CMAKE_BUILD_TYPE "Distribution" CACHE STRING "Build type") -set(CFLAGS "-mno-avx" CACHE STRING "CFLAGS") -set(CXXFLAGS "-mno-avx" CACHE STRING "CXXFLAGS") - -set(USE_CUDA ON CACHE BOOL "Build with CUDA support") -set(USE_CUDNN ON CACHE BOOL "Build with CUDNN support") -set(USE_NCCL ON CACHE BOOL "Build with NCCL support") -set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support") -set(USE_OPENMP ON CACHE BOOL "Build with Openmp support") -set(USE_MKL_IF_AVAILABLE OFF CACHE BOOL "Use Intel MKL if found") -set(USE_MKLDNN ON CACHE BOOL "Build with MKL-DNN support") -set(USE_LAPACK ON CACHE BOOL "Build with lapack support") -set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") -set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") -set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") -set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") - -set(CUDACXX "/usr/local/cuda-9.0/bin/nvcc" CACHE STRING "Cuda compiler") -set(MXNET_CUDA_ARCH "3.0;5.0;6.0;7.0;7.2" CACHE STRING "Cuda architectures") diff --git a/config/distribution/linux_cu91.cmake b/config/distribution/linux_cu91.cmake deleted file mode 100644 index a239ada43454..000000000000 --- a/config/distribution/linux_cu91.cmake +++ /dev/null @@ -1,36 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -set(CMAKE_BUILD_TYPE "Distribution" CACHE STRING "Build type") -set(CFLAGS "-mno-avx" CACHE STRING "CFLAGS") -set(CXXFLAGS "-mno-avx" CACHE STRING "CXXFLAGS") - -set(USE_CUDA ON CACHE BOOL "Build with CUDA support") -set(USE_CUDNN ON CACHE BOOL "Build with CUDNN support") -set(USE_NCCL ON CACHE BOOL "Build with NCCL support") -set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support") -set(USE_OPENMP ON CACHE BOOL "Build with Openmp support") -set(USE_MKL_IF_AVAILABLE OFF CACHE BOOL "Use Intel MKL if found") -set(USE_MKLDNN ON CACHE BOOL "Build with MKL-DNN support") -set(USE_LAPACK ON CACHE BOOL "Build with lapack support") -set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.") -set(USE_SSE ON CACHE BOOL "Build with x86 SSE instruction support") -set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support") -set(USE_LIBJPEG_TURBO ON CACHE BOOL "Build with libjpeg-turbo") - -set(CUDACXX "/usr/local/cuda-9.1/bin/nvcc" CACHE STRING "Cuda compiler") -set(MXNET_CUDA_ARCH "3.0;5.0;6.0;7.0;7.2" CACHE STRING "Cuda architectures") diff --git a/contrib/clojure-package/examples/rnn/get_data.sh b/contrib/clojure-package/examples/rnn/get_data.sh old mode 100644 new mode 100755 diff --git a/make/staticbuild/linux_cu90.mk b/make/staticbuild/linux_cu90.mk deleted file mode 100644 index 1d0669ef82b6..000000000000 --- a/make/staticbuild/linux_cu90.mk +++ /dev/null @@ -1,180 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------------------------- -# Template configuration for compiling mxnet for making python wheel -#------------------------------------------------------------------------------- - -#--------------------- -# choice of compiler -#-------------------- - -export CC = gcc -export CXX = g++ -export NVCC = nvcc - -# whether compile with options for MXNet developer -DEV = 0 - -# whether compile with debug -DEBUG = 0 - -# whether to turn on signal handler (e.g. 
segfault logger) -USE_SIGNAL_HANDLER = 1 - -# the additional link flags you want to add -ifdef USE_SYSTEM_CUDA -ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections -else -ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections -endif - -# the additional compile flags you want to add -ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections - -#--------------------------------------------- -# matrix computation libraries for CPU/GPU -#--------------------------------------------- - -# choose the version of blas you want to use -# can be: mkl, blas, atlas, openblas -# in default use atlas for linux while apple for osx -USE_BLAS=openblas - -# whether use opencv during compilation -# you can disable it, however, you will not able to use -# imbin iterator -USE_OPENCV = 1 -# Add OpenCV include path, in which the directory `opencv2` exists -USE_OPENCV_INC_PATH = NONE -# Add OpenCV shared library path, in which the shared library exists -USE_OPENCV_LIB_PATH = NONE - -# whether use CUDA during compile -USE_CUDA = 1 - -# add the path to CUDA library to link and compile flag -# if you have already add them to environment variable, leave it as NONE -# USE_CUDA_PATH = /usr/local/cuda -ifdef USE_SYSTEM_CUDA -USE_CUDA_PATH = /usr/local/cuda-9.0 -else -USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.0 -endif - -# whether to use CuDNN library -USE_CUDNN = 1 - -# whether to use NCCL library -USE_NCCL = 1 - -# CUDA architecture setting: going with all of them. -# For CUDA < 6.0, comment the *_50 lines for compatibility. -# CUDA_ARCH := - -# whether use cuda runtime compiling for writing kernels in native language (i.e. Python) -ENABLE_CUDA_RTC = 1 - -USE_NVTX=1 - -# use openmp for parallelization -USE_OPENMP = 1 -USE_OPERATOR_TUNING = 1 -USE_LIBJPEG_TURBO = 1 - -# whether use MKL-DNN library -USE_MKLDNN = 1 - -# whether use NNPACK library -USE_NNPACK = 0 - -# whether use lapack during compilation -# only effective when compiled with blas versions openblas/apple/atlas/mkl -USE_LAPACK = 1 - -# path to lapack library in case of a non-standard installation -USE_LAPACK_PATH = $(DEPS_PATH)/lib - -# add path to intel library, you may need it for MKL, if you did not add the path -# to environment variable -USE_INTEL_PATH = NONE - -# If use MKL, choose static link automatically to allow python wrapper -ifeq ($(USE_BLAS), mkl) -USE_STATIC_MKL = 1 -else -USE_STATIC_MKL = NONE -endif - -#---------------------------- -# Settings for power and arm arch -#---------------------------- -ARCH := $(shell uname -a) -ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64)) - USE_SSE=0 -else - USE_SSE=1 -endif - -#---------------------------- -# distributed computing -#---------------------------- - -# whether or not to enable multi-machine supporting -USE_DIST_KVSTORE = 1 - -# whether or not allow to read and write HDFS directly. If yes, then hadoop is -# required -USE_HDFS = 0 - -# path to libjvm.so. required if USE_HDFS=1 -LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server - -# whether or not allow to read and write AWS S3 directly. 
If yes, then -# libcurl4-openssl-dev is required, it can be installed on Ubuntu by -# sudo apt-get install -y libcurl4-openssl-dev -USE_S3 = 1 - -#---------------------------- -# additional operators -#---------------------------- - -# path to folders containing projects specific operators that you don't want to put in src/operators -EXTRA_OPERATORS = - - -#---------------------------- -# plugins -#---------------------------- - -# whether to use caffe integration. This requires installing caffe. -# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH -# CAFFE_PATH = $(HOME)/caffe -# MXNET_PLUGINS += plugin/caffe/caffe.mk - -# whether to use torch integration. This requires installing torch. -# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH -# TORCH_PATH = $(HOME)/torch -# MXNET_PLUGINS += plugin/torch/torch.mk - -# WARPCTC_PATH = $(HOME)/warp-ctc -# MXNET_PLUGINS += plugin/warpctc/warpctc.mk - -# whether to use sframe integration. This requires build sframe -# git@github.com:dato-code/SFrame.git -# SFRAME_PATH = $(HOME)/SFrame -# MXNET_PLUGINS += plugin/sframe/plugin.mk diff --git a/make/staticbuild/linux_cu91.mk b/make/staticbuild/linux_cu91.mk deleted file mode 100644 index 89b35b10f6fa..000000000000 --- a/make/staticbuild/linux_cu91.mk +++ /dev/null @@ -1,180 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# -#------------------------------------------------------------------------------- -# Template configuration for compiling mxnet for making python wheel -#------------------------------------------------------------------------------- - -#--------------------- -# choice of compiler -#-------------------- - -export CC = gcc -export CXX = g++ -export NVCC = nvcc - -# whether compile with options for MXNet developer -DEV = 0 - -# whether compile with debug -DEBUG = 0 - -# whether to turn on signal handler (e.g. 
segfault logger) -USE_SIGNAL_HANDLER = 1 - -# the additional link flags you want to add -ifdef USE_SYSTEM_CUDA -ADD_LDFLAGS += -L$(DEPS_PATH)/lib -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections -else -ADD_LDFLAGS += -L$(DEPS_PATH)/lib $(DEPS_PATH)/lib/libculibos.a -lpng -ltiff -ljpeg -lz -ldl -lgfortran -Wl,--version-script=$(CURDIR)/make/config/libmxnet.ver,-rpath,'$${ORIGIN}',--gc-sections -endif - -# the additional compile flags you want to add -ADD_CFLAGS += -I$(DEPS_PATH)/include -ffunction-sections -fdata-sections - -#--------------------------------------------- -# matrix computation libraries for CPU/GPU -#--------------------------------------------- - -# choose the version of blas you want to use -# can be: mkl, blas, atlas, openblas -# in default use atlas for linux while apple for osx -USE_BLAS=openblas - -# whether use opencv during compilation -# you can disable it, however, you will not able to use -# imbin iterator -USE_OPENCV = 1 -# Add OpenCV include path, in which the directory `opencv2` exists -USE_OPENCV_INC_PATH = NONE -# Add OpenCV shared library path, in which the shared library exists -USE_OPENCV_LIB_PATH = NONE - -# whether use CUDA during compile -USE_CUDA = 1 - -# add the path to CUDA library to link and compile flag -# if you have already add them to environment variable, leave it as NONE -# USE_CUDA_PATH = /usr/local/cuda -ifdef USE_SYSTEM_CUDA -USE_CUDA_PATH = /usr/local/cuda-9.1 -else -USE_CUDA_PATH = $(DEPS_PATH)/usr/local/cuda-9.1 -endif - -# whether to use CuDNN library -USE_CUDNN = 1 - -# whether to use NCCL library -USE_NCCL = 1 - -# CUDA architecture setting: going with all of them. -# For CUDA < 6.0, comment the *_50 lines for compatibility. -# CUDA_ARCH := - -# whether use cuda runtime compiling for writing kernels in native language (i.e. Python) -ENABLE_CUDA_RTC = 1 - -USE_NVTX=1 - -# use openmp for parallelization -USE_OPENMP = 1 -USE_OPERATOR_TUNING = 1 -USE_LIBJPEG_TURBO = 1 - -# whether use MKL-DNN library -USE_MKLDNN = 1 - -# whether use NNPACK library -USE_NNPACK = 0 - -# whether use lapack during compilation -# only effective when compiled with blas versions openblas/apple/atlas/mkl -USE_LAPACK = 1 - -# path to lapack library in case of a non-standard installation -USE_LAPACK_PATH = $(DEPS_PATH)/lib - -# add path to intel library, you may need it for MKL, if you did not add the path -# to environment variable -USE_INTEL_PATH = NONE - -# If use MKL, choose static link automatically to allow python wrapper -ifeq ($(USE_BLAS), mkl) -USE_STATIC_MKL = 1 -else -USE_STATIC_MKL = NONE -endif - -#---------------------------- -# Settings for power and arm arch -#---------------------------- -ARCH := $(shell uname -a) -ifneq (,$(filter $(ARCH), armv6l armv7l powerpc64le ppc64le aarch64)) - USE_SSE=0 -else - USE_SSE=1 -endif - -#---------------------------- -# distributed computing -#---------------------------- - -# whether or not to enable multi-machine supporting -USE_DIST_KVSTORE = 1 - -# whether or not allow to read and write HDFS directly. If yes, then hadoop is -# required -USE_HDFS = 0 - -# path to libjvm.so. required if USE_HDFS=1 -LIBJVM=$(JAVA_HOME)/jre/lib/amd64/server - -# whether or not allow to read and write AWS S3 directly. 
If yes, then -# libcurl4-openssl-dev is required, it can be installed on Ubuntu by -# sudo apt-get install -y libcurl4-openssl-dev -USE_S3 = 1 - -#---------------------------- -# additional operators -#---------------------------- - -# path to folders containing projects specific operators that you don't want to put in src/operators -EXTRA_OPERATORS = - - -#---------------------------- -# plugins -#---------------------------- - -# whether to use caffe integration. This requires installing caffe. -# You also need to add CAFFE_PATH/build/lib to your LD_LIBRARY_PATH -# CAFFE_PATH = $(HOME)/caffe -# MXNET_PLUGINS += plugin/caffe/caffe.mk - -# whether to use torch integration. This requires installing torch. -# You also need to add TORCH_PATH/install/lib to your LD_LIBRARY_PATH -# TORCH_PATH = $(HOME)/torch -# MXNET_PLUGINS += plugin/torch/torch.mk - -# WARPCTC_PATH = $(HOME)/warp-ctc -# MXNET_PLUGINS += plugin/warpctc/warpctc.mk - -# whether to use sframe integration. This requires build sframe -# git@github.com:dato-code/SFrame.git -# SFRAME_PATH = $(HOME)/SFrame -# MXNET_PLUGINS += plugin/sframe/plugin.mk diff --git a/python/mxnet/ndarray/numpy/_op.py b/python/mxnet/ndarray/numpy/_op.py index 824bc42ec6ea..f889bd7729a7 100644 --- a/python/mxnet/ndarray/numpy/_op.py +++ b/python/mxnet/ndarray/numpy/_op.py @@ -2697,41 +2697,6 @@ def fabs(x, out=None, **kwargs): return _pure_unary_func_helper(x, _api_internal.abs, _np.abs, out=out, **kwargs) -@set_module('mxnet.ndarray.numpy') -@wrap_np_unary_func -def absolute(x, out=None, **kwargs): - r""" - Calculate the absolute value element-wise. - - This function returns the absolute values (positive magnitude) of the - data in `x`. Complex values are not handled, use `absolute` to find the - absolute values of complex data. - - Parameters - ---------- - x : ndarray or scalar - Input array. - out : ndarray or None, optional - A location into which the result is stored. If provided, it must have - a shape that the inputs broadcast to. If not provided or `None`, - a freshly-allocated array is returned. - - Returns - ------- - absolute : ndarray - An ndarray containing the absolute value of - each element in `x`. This is a scalar if `x` is a scalar. - - Examples - -------- - >>> np.fabs(-1) - 1.0 - >>> np.fabs(np.array([-1.2, 1.2]))s - array([ 1.2, 1.2]) - """ - return _pure_unary_func_helper(x, _api_internal.abs, _np.abs, out=out, **kwargs) - - @set_module('mxnet.ndarray.numpy') @wrap_np_unary_func def absolute(x, out=None, **kwargs): @@ -4046,7 +4011,6 @@ def transpose(a, axes=None): return _api_internal.transpose(a, axes) - # pylint: disable=redefined-outer-name @set_module('mxnet.ndarray.numpy') def split(ary, indices_or_sections, axis=0): @@ -7635,55 +7599,6 @@ def median(a, axis=None, out=None, overwrite_input=None, keepdims=False): interpolation='midpoint', keepdims=keepdims) -@set_module('mxnet.ndarray.numpy') -def median(a, axis=None, out=None, overwrite_input=None, keepdims=False): - r""" - Compute the median along the specified axis. - Returns the median of the array elements. - Parameters - ---------- - a : array_like - Input array or object that can be converted to an array. - axis : {int, sequence of int, None}, optional - Axis or axes along which the medians are computed. The default - is to compute the median along a flattened version of the array. - A sequence of axes is supported since version 1.9.0. - out : ndarray, optional - Alternative output array in which to place the result. 
It must - have the same shape and buffer length as the expected output, - but the type (of the output) will be cast if necessary. - keepdims : bool, optional - If this is set to True, the axes which are reduced are left - in the result as dimensions with size one. With this option, - the result will broadcast correctly against the original `arr`. - Returns - ------- - median : ndarray - A new array holding the result. If the input contains integers - or floats smaller than ``float32``, then the output data-type is - ``np.float32``. Otherwise, the data-type of the output is the - same as that of the input. If `out` is specified, that array is - returned instead. - See Also - -------- - mean, percentile - Examples - -------- - >>> a = np.array([[10, 7, 4], [3, 2, 1]]) - >>> a - array([[10, 7, 4], - [ 3, 2, 1]]) - >>> np.median(a) - 3.5 - >>> np.median(a, axis=0) - array([6.5, 4.5, 2.5]) - >>> np.median(a, axis=1) - array([7., 2.]) - """ - return quantile(a=a, q=0.5, axis=axis, out=out, overwrite_input=overwrite_input, - interpolation='midpoint', keepdims=keepdims) - - @set_module('mxnet.ndarray.numpy') def quantile(a, q, axis=None, out=None, overwrite_input=None, interpolation='linear', keepdims=False): # pylint: disable=too-many-arguments """ @@ -8284,14 +8199,6 @@ def isnan(x, out=None, **kwargs): - ``out`` param: cannot perform auto type cast. ``out`` ndarray's dtype must be the same as the expected output. - ``out`` param does not support scalar input case. - This function differs from the original `numpy.where - `_ in - the following way(s): - - - If `condition` is a scalar, this operator returns x or y directly without broadcasting. - - If `condition` is ndarray, while both `x` and `y` are scalars, - the output dtype will be `float32`. - Examples -------- >>> np.isnan(np.nan) diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py index a8353a701f15..f8cf69aaa1c4 100644 --- a/python/mxnet/numpy/multiarray.py +++ b/python/mxnet/numpy/multiarray.py @@ -5848,114 +5848,6 @@ def triu(m, k=0): return _mx_nd_np.triu(m, k) -@set_module('mxnet.numpy') -def arange(start, stop=None, step=1, dtype=None, ctx=None): - """Return evenly spaced values within a given interval. - - Values are generated within the half-open interval ``[start, stop)`` - (in other words, the interval including `start` but excluding `stop`). - For integer arguments the function is equivalent to the Python built-in - `range` function, but returns an ndarray rather than a list. - - Parameters - ---------- - n : int - The row dimension of the arrays for which the returned - indices will be valid. - k : int, optional - Diagonal offset (see `tril` for details). - m : int, optional - .. versionadded:: 1.9.0 - - The column dimension of the arrays for which the returned - arrays will be valid. - By default `m` is taken equal to `n`. - - Returns - ------- - inds : tuple of arrays - The indices for the triangle. The returned tuple contains two arrays, - each with the indices along one dimension of the array. - - See also - -------- - triu_indices : similar function, for upper-triangular. - mask_indices : generic function accepting an arbitrary mask function. 
- tril, triu - - Examples - -------- - Compute two different sets of indices to access 4x4 arrays, one for the - lower triangular part starting at the main diagonal, and one starting two - diagonals further right: - - >>> il1 = np.tril_indices(4) - >>> il2 = np.tril_indices(4, 2) - - Here is how they can be used with a sample array: - - >>> a = np.arange(16).reshape(4, 4) - >>> a - array([[ 0, 1, 2, 3], - [ 4, 5, 6, 7], - [ 8, 9, 10, 11], - [12, 13, 14, 15]]) - - Both for indexing: - - >>> a[il1] - array([ 0, 4, 5, 8, 9, 10, 12, 13, 14, 15]) - - And for assigning values: - - >>> a[il1] = -1 - >>> a - array([[-1, 1, 2, 3], - [-1, -1, 6, 7], - [-1, -1, -1, 11], - [-1, -1, -1, -1]]) - - These cover almost the whole array (two diagonals right of the main one): - - >>> a[il2] = -10 - >>> a - array([[-10, -10, -10, 3], - [-10, -10, -10, -10], - [-10, -10, -10, -10], - [-10, -10, -10, -10]]) - - """ - if m is None: - m = n - return tuple(_mx_nd_np.tril_indices(n, k, m)) - - -# pylint: disable=redefined-outer-name -@set_module('mxnet.numpy') -def triu(m, k=0): - r""" - Upper triangle of an array. - - Return a copy of a matrix with the elements below the `k`-th diagonal - zeroed. - - Please refer to the documentation for `tril` for further details. - - See Also - -------- - tril : lower triangle of an array - - Examples - -------- - >>> np.triu(np.array([[1,2,3],[4,5,6],[7,8,9],[10,11,12]]), -1) - array([[ 1, 2, 3], - [ 4, 5, 6], - [ 0, 8, 9], - [ 0, 0, 12]]) - """ - return _mx_nd_np.triu(m, k) - - @set_module('mxnet.numpy') def arange(start, stop=None, step=1, dtype=None, ctx=None): """Return evenly spaced values within a given interval. @@ -9012,72 +8904,6 @@ def roll(a, shift, axis=None): return _mx_nd_np.roll(a, shift, axis=axis) -@set_module('mxnet.numpy') -def rot90(m, k=1, axes=(0, 1)): - """ - Roll array elements along a given axis. - - Elements that roll beyond the last position are re-introduced at - the first. - - Parameters - ---------- - a : ndarray - Input array. - shift : int or tuple of ints - The number of places by which elements are shifted. If a tuple, - then `axis` must be a tuple of the same size, and each of the - given axes is shifted by the corresponding number. If an int - while `axis` is a tuple of ints, then the same value is used for - all given axes. - axis : int or tuple of ints, optional - Axis or axes along which elements are shifted. By default, the - array is flattened before shifting, after which the original - shape is restored. - - Returns - ------- - res : ndarray - Output array, with the same shape as `a`. - - Notes - ----- - Supports rolling over multiple dimensions simultaneously. 
- - Examples - -------- - >>> x = np.arange(10) - >>> np.roll(x, 2) - array([8., 9., 0., 1., 2., 3., 4., 5., 6., 7.]) - >>> np.roll(x, -2) - array([2., 3., 4., 5., 6., 7., 8., 9., 0., 1.]) - - >>> x2 = np.reshape(x, (2,5)) - >>> x2 - array([[0., 1., 2., 3., 4.], - [5., 6., 7., 8., 9.]]) - >>> np.roll(x2, 1) - array([[9., 0., 1., 2., 3.], - [4., 5., 6., 7., 8.]]) - >>> np.roll(x2, -1) - array([[1., 2., 3., 4., 5.], - [6., 7., 8., 9., 0.]]) - >>> np.roll(x2, 1, axis=0) - array([[5., 6., 7., 8., 9.], - [0., 1., 2., 3., 4.]]) - >>> np.roll(x2, -1, axis=0) - array([[5., 6., 7., 8., 9.], - [0., 1., 2., 3., 4.]]) - >>> np.roll(x2, 1, axis=1) - array([[4., 0., 1., 2., 3.], - [9., 5., 6., 7., 8.]]) - >>> np.roll(x2, -1, axis=1) - array([[1., 2., 3., 4., 0.], - [6., 7., 8., 9., 5.]]) - """ - return _mx_nd_np.roll(a, shift, axis=axis) - - @set_module('mxnet.numpy') def rot90(m, k=1, axes=(0, 1)): """ diff --git a/scala-package/examples/scripts/module/mnist_mlp.sh b/scala-package/examples/scripts/module/mnist_mlp.sh index 41d019b1473b..907552a45b46 100755 --- a/scala-package/examples/scripts/module/mnist_mlp.sh +++ b/scala-package/examples/scripts/module/mnist_mlp.sh @@ -1,3 +1,5 @@ +#!/bin/bash + # Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information @@ -14,11 +16,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Common runtime ctypes. -Acknowledgement: This file originates from incubator-tvm -""" -# pylint: disable=invalid-name -import ctypes ROOT_DIR=$(cd `dirname $0`/../../..; pwd) CLASSPATH=$ROOT_DIR/assembly/target/*:$ROOT_DIR/examples/target/*:$ROOT_DIR/examples/target/classes/lib/* diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py index 395a1e124284..7309327d27c3 100644 --- a/tests/python/unittest/test_numpy_interoperability.py +++ b/tests/python/unittest/test_numpy_interoperability.py @@ -1818,7 +1818,7 @@ def test_shapes(): a = np.ones((2,), dtype=dt) b = np.ones((2,), dtype=dt) OpArgMngr.add_workload('matmul', a, b) - + def test_result_types(): mat = np.ones((1,1)) vec = np.ones((1,)) @@ -1827,7 +1827,7 @@ def test_result_types(): v = vec.astype(dt) for arg in [(m, v), (v, m), (m, m)]: OpArgMngr.add_workload('matmul', *arg) - + def test_scalar_output(): vec1 = np.array([2]) vec2 = np.array([3, 4]).reshape(1, -1) @@ -1836,7 +1836,7 @@ def test_scalar_output(): v2 = vec2.astype(dt) OpArgMngr.add_workload('matmul', v1, v2) OpArgMngr.add_workload('matmul', v2.T, v1) - + def test_vector_vector_values(): vec1 = np.array([1, 2]) vec2 = np.array([3, 4]).reshape(-1, 1) @@ -1868,7 +1868,7 @@ def test_matrix_vector_values(): m2 = mat2.astype(dt) OpArgMngr.add_workload('matmul', m1, v) OpArgMngr.add_workload('matmul', m2, v) - + def test_matrix_matrix_values(): mat1 = np.array([[1, 2], [3, 4]]) mat2 = np.array([[1, 0], [1, 1]]) diff --git a/tests/python/unittest/test_numpy_op.py b/tests/python/unittest/test_numpy_op.py index c1930e0dbf0f..be759f6fa3d5 100644 --- a/tests/python/unittest/test_numpy_op.py +++ b/tests/python/unittest/test_numpy_op.py @@ -5798,7 +5798,7 @@ class TestLstsq(HybridBlock): def __init__(self, rcond): super(TestLstsq, self).__init__() self._rcond = rcond - + def hybrid_forward(self, F, a, b, rcond='warn'): return F.np.linalg.lstsq(a, b, rcond=self._rcond) @@ -7195,10 +7195,10 @@ def 
__init__(self, n, k=0, m=None): if m is None: m = n self._m = m - + def hybrid_forward(self, F, x, *args, **kwargs): return x, F.np.tril_indices(n=self._n, k=self._k, m=self._m) - + for n in _np.random.random_integers(-10, 50, 2): for k in _np.random.random_integers(-50, 50, 2): for m in _np.random.random_integers(-10, 50, 2): @@ -7219,7 +7219,7 @@ def hybrid_forward(self, F, x, *args, **kwargs): np_data[np_out] = -10 mx_data[mx_out] = -10 assert same(np_data, mx_data.asnumpy()) - + @with_seed() @use_np @@ -7463,56 +7463,6 @@ def hybrid_forward(self, F, a, *args, **kwargs): assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) -@with_seed() -@use_np -def test_np_einsum(): - class TestEinsum(HybridBlock): - def __init__(self, subscripts, optimize): - super(TestEinsum, self).__init__() - self.subscripts = subscripts - self.optimize = optimize - - def hybrid_forward(self, F, a, *args, **kwargs): - return F.np.dsplit(a, indices_or_sections=self._indices_or_sections) - - shapes = [ - (2, 4, 6), - (3, 0, 6), - (2, 3, 0, 4), - ] - indices_or_sections_num = [ - (2, 4), - (3, 3), - (3,), - (1,), - 2, - ] - for hybridize in [True, False]: - for shape in shapes: - for indices_or_sections in indices_or_sections_num: - # test gluon - test_dsplit = TestDSplit(indices_or_sections=indices_or_sections) - if hybridize: - test_dsplit.hybridize() - - a = mx.nd.random.uniform(-1.0, 1.0, shape=shape).as_np_ndarray() - a.attach_grad() - expected_ret = _np.dsplit(a.asnumpy(), indices_or_sections=indices_or_sections) - with mx.autograd.record(): - y = test_dsplit(a) - assert len(y) == len(expected_ret) - for mx_out, np_out in zip(y, expected_ret): - assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) - mx.autograd.backward(y) - assert_almost_equal(a.grad.asnumpy(), _np.ones(a.shape), rtol=1e-3, atol=1e-5) - - # test imperative - mx_outs = np.dsplit(a, indices_or_sections=indices_or_sections) - np_outs = _np.dsplit(a.asnumpy(), indices_or_sections=indices_or_sections) - for mx_out, np_out in zip(mx_outs, np_outs): - assert_almost_equal(mx_out.asnumpy(), np_out, rtol=1e-3, atol=1e-5) - - @with_seed() @use_np def test_np_einsum(): @@ -8029,7 +7979,7 @@ def hybrid_forward(self, F, a): a = np.random.uniform(-1.0, 1.0, size=a_shape) np_out = _np.median(a.asnumpy(), axis=axis, keepdims=keepdims) mx_out = test_median(a) - + assert mx_out.shape == np_out.shape assert_almost_equal(mx_out.asnumpy(), np_out, atol=atol, rtol=rtol) @@ -9046,10 +8996,10 @@ def __init__(self, left=None, right=None, period=None): self._left = left self._right = right self._period = period - + def hybrid_forward(self, F, x, xp, fp): return F.np.interp(x, xp, fp, left=self._left, right=self._right, period=self._period) - + class TestInterpScalar(HybridBlock): def __init__(self, x=None, left=None, right=None, period=None): super(TestInterpScalar, self).__init__() @@ -9057,7 +9007,7 @@ def __init__(self, x=None, left=None, right=None, period=None): self._left = left self._right = right self._period = period - + def hybrid_forward(self, F, xp, fp): return F.np.interp(self._x, xp, fp, left=self._left, right=self._right, period=self._period) @@ -9084,13 +9034,13 @@ def hybrid_forward(self, F, xp, fp): else: x = np.random.uniform(0, 100, size=xshape).astype(xtype) xp = np.sort(np.random.choice(100, dsize, replace=False).astype(dtype)) - fp = np.random.uniform(-50, 50, size=dsize).astype(dtype) + fp = np.random.uniform(-50, 50, size=dsize).astype(dtype) np_x = x.asnumpy() if x_scalar and xshape == (): x = x.item() np_x = x 
                    test_interp = TestInterpScalar(x=x, left=left, right=right, period=period)
-                else:
+                else:
                    test_interp = TestInterp(left=left, right=right, period=period)
                if hybridize:
                    test_interp.hybridize()
@@ -9478,7 +9428,7 @@ def __init__(self, axis=0, start=0):
             super(TestRollaxis, self).__init__()
             self._axis = axis
             self._start = start
-
+
         def hybrid_forward(self, F, a, *args, **kwargs):
             return F.np.rollaxis(a, axis=self._axis, start=self._start)
diff --git a/tools/pip/doc/CU90_ADDITIONAL.md b/tools/pip/doc/CU90_ADDITIONAL.md
deleted file mode 100644
index 3f51c50520f2..000000000000
--- a/tools/pip/doc/CU90_ADDITIONAL.md
+++ /dev/null
@@ -1,47 +0,0 @@
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-**CUDA 9.0 package for MXNet is no longer maintained for new releases.**
-
-Prerequisites
--------------
-This package supports Linux and Windows platforms. You may also want to check:
-- [mxnet-cu102](https://pypi.python.org/pypi/mxnet-cu102/) with CUDA-10.2 support.
-- [mxnet-cu101](https://pypi.python.org/pypi/mxnet-cu101/) with CUDA-10.1 support.
-- [mxnet-cu100](https://pypi.python.org/pypi/mxnet-cu100/) with CUDA-10.0 support.
-- [mxnet-cu92](https://pypi.python.org/pypi/mxnet-cu92/) with CUDA-9.2 support.
-- [mxnet-cu90](https://pypi.python.org/pypi/mxnet-cu90/) with CUDA-9.0 support.
-- [mxnet](https://pypi.python.org/pypi/mxnet/).
-- [mxnet-native](https://pypi.python.org/pypi/mxnet-native/) CPU variant without MKLDNN.
-
-To download CUDA, check [CUDA download](https://developer.nvidia.com/cuda-downloads). For more instructions, check [CUDA Toolkit online documentation](http://docs.nvidia.com/cuda/index.html).
-
-To install for other platforms (e.g. Windows, Raspberry Pi/ARM) or other versions, check [Installing MXNet](https://mxnet.apache.org/versions/master/install/index.html) for instructions on building from source.
-
-Installation
-------------
-To install:
-```bash
-pip install mxnet-cu90
-```
-
-Nightly Builds
---------------
-To install the latest nightly build, use:
-```bash
-pip install --pre mxnet-cu90 -f https://dist.mxnet.io/python
-```
diff --git a/tools/staticbuild/README.md b/tools/staticbuild/README.md
index b06f3b9a5a05..077150e11762 100644
--- a/tools/staticbuild/README.md
+++ b/tools/staticbuild/README.md
@@ -27,7 +27,7 @@ environment variable settings. Here are examples you can run with this script:
 ```
 tools/staticbuild/build.sh cu102
 ```
-This would build the mxnet package based on CUDA 10.2. Currently, we support variants cpu, native, cu92, cu100, cu101 and cu102. All of these variants expect native have MKL-DNN backend enabled.
+This would build the mxnet package based on CUDA 10.2. Currently, we support variants cpu, native, cu92, cu100, cu101 and cu102. All of these variants except native have MKL-DNN backend enabled.
 ```
 tools/staticbuild/build.sh cpu

From 7996fe26838c8656cfcf0a93e2102a4c41bd97c6 Mon Sep 17 00:00:00 2001
From: "Joshua Z. Zhang"
Date: Fri, 17 Apr 2020 13:50:54 -0700
Subject: [PATCH 03/17] fix error

---
 .../contrib/data/vision/transforms/bbox/__init__.py | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py
index c3496b7086ef..e728064e02b1 100644
--- a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py
+++ b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py
@@ -14,13 +14,4 @@
 # KIND, either express or implied.
See the License for the # specific language governing permissions and limitations # under the License. - -all: html - -html: - mkdir -p build/html - doxygen Doxyfile - - -clean: - rm -rf build +from bbox import * From 07c02f85ee7f91bad9561e80f52b30fb7305d443 Mon Sep 17 00:00:00 2001 From: "Joshua Z. Zhang" Date: Fri, 17 Apr 2020 13:55:50 -0700 Subject: [PATCH 04/17] fix import error --- .../mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py index e728064e02b1..c9649255f757 100644 --- a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py +++ b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py @@ -14,4 +14,4 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -from bbox import * +from .bbox import * From 318d7aead184603ed74d69b30386aed06c0c98e2 Mon Sep 17 00:00:00 2001 From: Joshua Zhang Date: Sat, 18 Apr 2020 00:15:59 -0700 Subject: [PATCH 05/17] fix ci build --- ci/build_windows.py | 1 + .../gluon/contrib/data/vision/transforms/bbox/__init__.py | 1 + src/io/iter_sampler.cc | 1 + tests/python/unittest/test_gluon_data.py | 6 +++--- 4 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ci/build_windows.py b/ci/build_windows.py index c8d3af515b5a..973534e5c0de 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -202,6 +202,7 @@ def windows_build(args): if ret != 0: build_try += 1 logging.info("{} build(s) have failed".format(build_try)) + sys.exit(1) else: logging.info("Build flavour: {} complete in directory: \"{}\"".format(args.flavour, os.path.abspath(path))) logging.info("Build took {}".format(datetime.timedelta(seconds=int(time.time() - t0)))) diff --git a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py index c9649255f757..443dbcdb1e09 100644 --- a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py +++ b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/__init__.py @@ -14,4 +14,5 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+"""Gluon contrib vision bbox transform""" from .bbox import * diff --git a/src/io/iter_sampler.cc b/src/io/iter_sampler.cc index daf3f1c8fec4..8566d1c983e3 100644 --- a/src/io/iter_sampler.cc +++ b/src/io/iter_sampler.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include "../common/utils.h" #include "./iter_batchloader.h" #include "./iter_prefetcher.h" diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py index ab9d448f6044..06481e69bc16 100644 --- a/tests/python/unittest/test_gluon_data.py +++ b/tests/python/unittest/test_gluon_data.py @@ -101,7 +101,7 @@ def _dataset_transform_first_fn(x): def test_recordimage_dataset_with_data_loader_multiworker(): recfile = prepare_record() dataset = gluon.data.vision.ImageRecordDataset(recfile) - loader = gluon.data.DataLoader(dataset, 1, num_workers=5, try_nopython=True) + loader = gluon.data.DataLoader(dataset, 1, num_workers=5, try_nopython=False) for i, (x, y) in enumerate(loader): assert x.shape[0] == 1 and x.shape[3] == 3 @@ -109,7 +109,7 @@ def test_recordimage_dataset_with_data_loader_multiworker(): # with transform dataset = gluon.data.vision.ImageRecordDataset(recfile).transform(_dataset_transform_fn) - loader = gluon.data.DataLoader(dataset, 1, num_workers=5, try_nopython=False) + loader = gluon.data.DataLoader(dataset, 1, num_workers=5, try_nopython=True) for i, (x, y) in enumerate(loader): assert x.shape[0] == 1 and x.shape[3] == 3 @@ -117,7 +117,7 @@ def test_recordimage_dataset_with_data_loader_multiworker(): # with transform_first dataset = gluon.data.vision.ImageRecordDataset(recfile).transform_first(_dataset_transform_first_fn) - loader = gluon.data.DataLoader(dataset, 1, num_workers=5, try_nopython=False) + loader = gluon.data.DataLoader(dataset, 1, num_workers=5, try_nopython=None) for i, (x, y) in enumerate(loader): assert x.shape[0] == 1 and x.shape[3] == 3 From 5d7e0e048a85e1f8d3da5000333e414d0bc4723e Mon Sep 17 00:00:00 2001 From: "Joshua Z. 
Zhang" Date: Sat, 18 Apr 2020 14:40:58 -0700 Subject: [PATCH 06/17] fix vs openmp loop type --- src/io/batchify.cc | 4 ++-- src/io/dataloader.cc | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/io/batchify.cc b/src/io/batchify.cc index ed61c742245e..8cb71a3fe84e 100644 --- a/src/io/batchify.cc +++ b/src/io/batchify.cc @@ -150,7 +150,7 @@ class StackBatchify : public BatchifyFunction { } MSHADOW_TYPE_SWITCH_WITH_BOOL(dtype, DType, { _Pragma("omp parallel for num_threads(bs)") - for (size_t j = 0; j < bs; ++j) { + for (int j = 0; j < bs; ++j) { omp_exc_.Run([&] { // inputs[j][i].WaitToRead(); DType *ptr = (*outputs)[i].data().dptr(); @@ -275,7 +275,7 @@ class PadBatchify : public BatchifyFunction { DType *ptr = (*outputs)[i].data().dptr(); auto asize = ashape.Size(); _Pragma("omp parallel for num_threads(bs)") - for (size_t j = 0; j < bs; ++j) { + for (int j = 0; j < bs; ++j) { using namespace mshadow::expr; auto compact_shapes = CompactShapes(ashape, inputs[j][i].shape()); // inputs[j][i].WaitToRead(); diff --git a/src/io/dataloader.cc b/src/io/dataloader.cc index 5b47b8c01809..589c3b24c120 100644 --- a/src/io/dataloader.cc +++ b/src/io/dataloader.cc @@ -123,7 +123,7 @@ class ThreadedDataLoader : public IIterator { profiler::CustomOpProfiler::Get()->OnCustomBegin("MXThreadedDataLoaderGetItems"); } #pragma omp parallel for num_threads(param_.num_workers) - for (size_t i = 0; i < real_batch_size; ++i) { + for (int i = 0; i < real_batch_size; ++i) { omp_exc_.Run([&] { auto idx = idx_ptrs[i]; CHECK(datasets_[i % param_.num_workers]->GetItem(idx, &inputs[i])) From 3d3177ffee118c175837b0348345f59ea0fd3d55 Mon Sep 17 00:00:00 2001 From: Joshua Zhang Date: Sat, 18 Apr 2020 21:52:23 -0700 Subject: [PATCH 07/17] fix warning as error with sign/unsign comp --- src/io/batchify.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/io/batchify.cc b/src/io/batchify.cc index 8cb71a3fe84e..72a3ed4e0cf5 100644 --- a/src/io/batchify.cc +++ b/src/io/batchify.cc @@ -148,9 +148,10 @@ class StackBatchify : public BatchifyFunction { } else { (*outputs)[i] = NDArray(sshape, mxnet::Context::CPU(0), false, inputs[0][i].dtype()); } + int sbs = static_cast(bs); MSHADOW_TYPE_SWITCH_WITH_BOOL(dtype, DType, { _Pragma("omp parallel for num_threads(bs)") - for (int j = 0; j < bs; ++j) { + for (int j = 0; j < sbs; ++j) { omp_exc_.Run([&] { // inputs[j][i].WaitToRead(); DType *ptr = (*outputs)[i].data().dptr(); @@ -274,8 +275,9 @@ class PadBatchify : public BatchifyFunction { static_cast(param_.pad_val)); DType *ptr = (*outputs)[i].data().dptr(); auto asize = ashape.Size(); + int sbs = static_cast(bs); _Pragma("omp parallel for num_threads(bs)") - for (int j = 0; j < bs; ++j) { + for (int j = 0; j < sbs; ++j) { using namespace mshadow::expr; auto compact_shapes = CompactShapes(ashape, inputs[j][i].shape()); // inputs[j][i].WaitToRead(); From b8048e4a967f9059df11a7e24ff8c95118ab22e3 Mon Sep 17 00:00:00 2001 From: "Joshua Z. 
Zhang" Date: Sun, 19 Apr 2020 14:57:11 -0700 Subject: [PATCH 08/17] sign/unsign comp --- src/io/dataloader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/io/dataloader.cc b/src/io/dataloader.cc index 589c3b24c120..2ddd8cf73d89 100644 --- a/src/io/dataloader.cc +++ b/src/io/dataloader.cc @@ -109,7 +109,7 @@ class ThreadedDataLoader : public IIterator { if (!has_next) return false; auto samples = sampler_->Value(); auto batch_size = samples.data[0].shape().Size(); - auto real_batch_size = batch_size - samples.num_batch_padd; + int real_batch_size = batch_size - samples.num_batch_padd; const int64_t *idx_ptr = static_cast( samples.data[0].data().dptr_); std::vector idx_ptrs; From e5cfbfcc7fb985b9c6655585c665994a159ebbbe Mon Sep 17 00:00:00 2001 From: "Joshua Z. Zhang" Date: Thu, 23 Apr 2020 17:00:41 -0700 Subject: [PATCH 09/17] update to pytest --- ci/docker/Dockerfile.build.test.armv7 | 2 +- ci/docker/Dockerfile.build.test.armv8 | 2 +- .../test_contrib_gluon_data_vision.py | 30 ++++++++----------- .../test_numpy_contrib_gluon_data_vision.py | 30 ++++++++----------- 4 files changed, 28 insertions(+), 36 deletions(-) diff --git a/ci/docker/Dockerfile.build.test.armv7 b/ci/docker/Dockerfile.build.test.armv7 index 711bf651491f..066040c5be8f 100644 --- a/ci/docker/Dockerfile.build.test.armv7 +++ b/ci/docker/Dockerfile.build.test.armv7 @@ -47,4 +47,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ -WORKDIR /work/mxnet \ No newline at end of file +WORKDIR /work/mxnet diff --git a/ci/docker/Dockerfile.build.test.armv8 b/ci/docker/Dockerfile.build.test.armv8 index 920211440ef6..7a77c78bbeea 100644 --- a/ci/docker/Dockerfile.build.test.armv8 +++ b/ci/docker/Dockerfile.build.test.armv8 @@ -45,4 +45,4 @@ COPY install/ubuntu_adduser.sh /work/ RUN /work/ubuntu_adduser.sh COPY runtime_functions.sh /work/ -WORKDIR /work/mxnet \ No newline at end of file +WORKDIR /work/mxnet diff --git a/tests/python/unittest/test_contrib_gluon_data_vision.py b/tests/python/unittest/test_contrib_gluon_data_vision.py index 9a95d202bab7..29ef3e69b92d 100644 --- a/tests/python/unittest/test_contrib_gluon_data_vision.py +++ b/tests/python/unittest/test_contrib_gluon_data_vision.py @@ -54,27 +54,23 @@ def _generate_objects(): class TestImage(unittest.TestCase): IMAGES_URL = "http://data.mxnet.io/data/test_images.tar.gz" - IMAGES = [] - IMAGES_DIR = None - @classmethod - def setupClass(cls): - cls.IMAGES_DIR = tempfile.mkdtemp() - cls.IMAGES = _get_data(cls.IMAGES_URL, cls.IMAGES_DIR) - print("Loaded {} images".format(len(cls.IMAGES))) + def setUp(self): + self.IMAGES_DIR = tempfile.mkdtemp() + self.IMAGES = _get_data(self.IMAGES_URL, self.IMAGES_DIR) + print("Loaded {} images".format(len(self.IMAGES))) - @classmethod - def teardownClass(cls): - if cls.IMAGES_DIR: - print("cleanup {}".format(cls.IMAGES_DIR)) - shutil.rmtree(cls.IMAGES_DIR) + def tearDown(self): + if self.IMAGES_DIR: + print("cleanup {}".format(self.IMAGES_DIR)) + shutil.rmtree(self.IMAGES_DIR) def test_imageiter(self): - im_list = [[np.random.randint(0, 5), x] for x in TestImage.IMAGES] + im_list = [[np.random.randint(0, 5), x] for x in self.IMAGES] os.makedirs('./data', exist_ok=True) fname = './data/test_imageiter.lst' file_list = ['\t'.join([str(k), str(np.random.randint(0, 5)), x]) - for k, x in enumerate(TestImage.IMAGES)] + for k, x in enumerate(self.IMAGES)] with open(fname, 'w') as f: for line in file_list: f.write(line + '\n') @@ -102,7 +98,7 @@ def test_imageiter(self): pass 
def test_image_bbox_iter(self): - im_list = [_generate_objects() + [x] for x in TestImage.IMAGES] + im_list = [_generate_objects() + [x] for x in self.IMAGES] det_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 300, 300), imglist=im_list, path_root='') for _ in range(3): for _ in det_iter: @@ -117,7 +113,7 @@ def test_image_bbox_iter(self): # test file list with last batch handle os.makedirs('./data', exist_ok=True) fname = './data/test_imagedetiter.lst' - im_list = [[k] + _generate_objects() + [x] for k, x in enumerate(TestImage.IMAGES)] + im_list = [[k] + _generate_objects() + [x] for k, x in enumerate(self.IMAGES)] with open(fname, 'w') as f: for line in im_list: line = '\t'.join([str(k) for k in line]) @@ -140,7 +136,7 @@ def test_image_bbox_iter(self): def test_bbox_augmenters(self): # only test if all augmenters will work # TODO(Joshua Zhang): verify the augmenter outputs - im_list = [_generate_objects() + [x] for x in TestImage.IMAGES] + im_list = [_generate_objects() + [x] for x in self.IMAGES] det_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 300, 300), imglist=im_list, path_root='', rand_crop=1, rand_pad=1, rand_gray=0.1, rand_mirror=True, mean=True, std=[1.1, 1.03, 1.05], brightness=0.1, contrast=0.1, saturation=0.1, diff --git a/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py b/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py index 844138fefde8..47509e84d053 100644 --- a/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py +++ b/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py @@ -54,27 +54,23 @@ def _generate_objects(): class TestImage(unittest.TestCase): IMAGES_URL = "http://data.mxnet.io/data/test_images.tar.gz" - IMAGES = [] - IMAGES_DIR = None - @classmethod - def setupClass(cls): - cls.IMAGES_DIR = tempfile.mkdtemp() - cls.IMAGES = _get_data(cls.IMAGES_URL, cls.IMAGES_DIR) - print("Loaded {} images".format(len(cls.IMAGES))) + def setUp(self): + self.IMAGES_DIR = tempfile.mkdtemp() + self.IMAGES = _get_data(self.IMAGES_URL, self.IMAGES_DIR) + print("Loaded {} images".format(len(self.IMAGES))) - @classmethod - def teardownClass(cls): - if cls.IMAGES_DIR: - print("cleanup {}".format(cls.IMAGES_DIR)) - shutil.rmtree(cls.IMAGES_DIR) + def tearDown(self): + if self.IMAGES_DIR: + print("cleanup {}".format(self.IMAGES_DIR)) + shutil.rmtree(self.IMAGES_DIR) @use_np def test_imageiter(self): - im_list = [[np.random.randint(0, 5), x] for x in TestImage.IMAGES] + im_list = [[np.random.randint(0, 5), x] for x in self.IMAGES] fname = './data/test_imageiter.lst' file_list = ['\t'.join([str(k), str(np.random.randint(0, 5)), x]) - for k, x in enumerate(TestImage.IMAGES)] + for k, x in enumerate(self.IMAGES)] with open(fname, 'w') as f: for line in file_list: f.write(line + '\n') @@ -103,7 +99,7 @@ def test_imageiter(self): @use_np def test_image_bbox_iter(self): - im_list = [_generate_objects() + [x] for x in TestImage.IMAGES] + im_list = [_generate_objects() + [x] for x in self.IMAGES] det_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 300, 300), imglist=im_list, path_root='') for _ in range(3): for _ in det_iter: @@ -117,7 +113,7 @@ def test_image_bbox_iter(self): # test file list with last batch handle fname = './data/test_imagedetiter.lst' - im_list = [[k] + _generate_objects() + [x] for k, x in enumerate(TestImage.IMAGES)] + im_list = [[k] + _generate_objects() + [x] for k, x in enumerate(self.IMAGES)] with open(fname, 'w') as f: for line in im_list: line = '\t'.join([str(k) for k in 
line])
@@ -140,7 +136,7 @@ def test_image_bbox_iter(self):
     def test_bbox_augmenters(self):
         # only test if all augmenters will work
         # TODO(Joshua Zhang): verify the augmenter outputs
-        im_list = [_generate_objects() + [x] for x in TestImage.IMAGES]
+        im_list = [_generate_objects() + [x] for x in self.IMAGES]
         det_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 300, 300), imglist=im_list, path_root='',
             rand_crop=1, rand_pad=1, rand_gray=0.1, rand_mirror=True, mean=True,
             std=[1.1, 1.03, 1.05], brightness=0.1, contrast=0.1, saturation=0.1,

From 35d9a67db4b503d128e9fbcbbb91e2d3ebb53643 Mon Sep 17 00:00:00 2001
From: Joshua Zhang
Date: Sat, 25 Apr 2020 15:31:02 -0700
Subject: [PATCH 10/17] remove nose

---
 tests/python/unittest/test_contrib_gluon_data_vision.py       | 2 --
 tests/python/unittest/test_numpy_contrib_gluon_data_vision.py | 2 --
 tests/python/unittest/test_numpy_gluon_data_vision.py         | 4 ----
 3 files changed, 8 deletions(-)

diff --git a/tests/python/unittest/test_contrib_gluon_data_vision.py b/tests/python/unittest/test_contrib_gluon_data_vision.py
index 29ef3e69b92d..d2e38d66cb20 100644
--- a/tests/python/unittest/test_contrib_gluon_data_vision.py
+++ b/tests/python/unittest/test_contrib_gluon_data_vision.py
@@ -24,8 +24,6 @@
 import tempfile
 import unittest

-from nose.tools import raises
-
 def _get_data(url, dirname):
     import os, tarfile
     download(url, dirname=dirname, overwrite=False)
diff --git a/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py b/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py
index 47509e84d053..14f5f95cd8f7 100644
--- a/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py
+++ b/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py
@@ -24,8 +24,6 @@
 import tempfile
 import unittest

-from nose.tools import raises
-
 def _get_data(url, dirname):
     import os, tarfile
     download(url, dirname=dirname, overwrite=False)
diff --git a/tests/python/unittest/test_numpy_gluon_data_vision.py b/tests/python/unittest/test_numpy_gluon_data_vision.py
index 3b76cf4e78ff..ff8c633153c1 100644
--- a/tests/python/unittest/test_numpy_gluon_data_vision.py
+++ b/tests/python/unittest/test_numpy_gluon_data_vision.py
@@ -406,7 +406,3 @@ def test_bbox_crop():
         im_out, im_bbox = transform(img, bbox)
         assert im_out.shape == (3, 3, 3)
         assert im_bbox[0][2] == 3
-
-if __name__ == '__main__':
-    import nose
-    nose.runmodule()

From e559bd8a3a57885996f914a3452da76adda9bfa4 Mon Sep 17 00:00:00 2001
From: Joshua Zhang
Date: Sat, 25 Apr 2020 18:04:02 -0700
Subject: [PATCH 11/17] fix tear_down

---
 tests/python/unittest/test_numpy_gluon_data_vision.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/python/unittest/test_numpy_gluon_data_vision.py b/tests/python/unittest/test_numpy_gluon_data_vision.py
index ff8c633153c1..4deb15353678 100644
--- a/tests/python/unittest/test_numpy_gluon_data_vision.py
+++ b/tests/python/unittest/test_numpy_gluon_data_vision.py
@@ -26,7 +26,7 @@
 import mxnet as mx
 from mxnet import gluon, autograd, np, npx
 from mxnet.test_utils import use_np, assert_almost_equal, check_gluon_hybridize_consistency, same, check_symbolic_backward
-from common import with_seed, assertRaises, setup_module, with_seed, teardown
+from common import assertRaises, setup_module, with_seed, teardown_module
 import random
 from mxnet.base import MXNetError
 from mxnet.gluon.data.vision import transforms
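A note on the `teardown_module` rename above (the duplicated `with_seed` import is dropped in the same line): nose calls a module-level `teardown()` hook, while pytest's xunit-style hooks are named `setup_module` and `teardown_module`, which is the name the shared `common` module exposes. A minimal sketch of the pytest convention (a hypothetical test module, not from this patch):

    # test_example.py: pytest discovers these hooks by name
    import os
    import shutil
    import tempfile

    _workdir = None

    def setup_module():
        # runs once, before the first test in this module
        global _workdir
        _workdir = tempfile.mkdtemp()

    def teardown_module():
        # runs once, after the last test in this module
        shutil.rmtree(_workdir)

    def test_workdir_exists():
        assert os.path.isdir(_workdir)

From 28d5787a8a8f3266e3d5f9bfa875c49916eb2277 Mon Sep 17 00:00:00 2001
From: "Joshua Z. Zhang"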
Zhang" Date: Tue, 28 Apr 2020 17:53:04 -0700 Subject: [PATCH 12/17] address comments --- ci/build_windows.py | 1 - python/mxnet/gluon/data/dataloader.py | 13 ++++++------- src/io/iter_sampler.cc | 6 +++--- .../test_numpy_contrib_gluon_data_vision.py | 1 - .../python/unittest/test_numpy_interoperability.py | 9 ++++----- 5 files changed, 13 insertions(+), 17 deletions(-) diff --git a/ci/build_windows.py b/ci/build_windows.py index 973534e5c0de..c8d3af515b5a 100755 --- a/ci/build_windows.py +++ b/ci/build_windows.py @@ -202,7 +202,6 @@ def windows_build(args): if ret != 0: build_try += 1 logging.info("{} build(s) have failed".format(build_try)) - sys.exit(1) else: logging.info("Build flavour: {} complete in directory: \"{}\"".format(args.flavour, os.path.abspath(path))) logging.info("Build took {}".format(datetime.timedelta(seconds=int(time.time() - t0)))) diff --git a/python/mxnet/gluon/data/dataloader.py b/python/mxnet/gluon/data/dataloader.py index 3cbfa0e856c5..d991bc769ac9 100644 --- a/python/mxnet/gluon/data/dataloader.py +++ b/python/mxnet/gluon/data/dataloader.py @@ -22,7 +22,6 @@ import pickle import logging -import warnings import io import sys import signal @@ -564,14 +563,14 @@ class DataLoader(object): unless you are experiencing timeout and you know it's due to slow data loading. Sometimes full `shared_memory` will cause all workers to hang and causes timeout. In these cases please reduce `num_workers` or increase system `shared_memory` size instead. - try_nopython : bool, default is None + try_nopython : bool or None, default is None Try compile python dataloading pipeline into pure MXNet c++ implementation. The benefit is potentially faster iteration, no `shared_memory` usage, and less processes managed by python. The compilation is not gauranteed to support all use cases, but it will fallback to python in case of failure. You can set `try_nopython` to `False` to disable auto-detection of the compilation feature or leave it to `None` to allow MXNet to determine it automatically. - If you request `try_nopython` to `True` and the compilation fails, it will raise a warning and - continue with python based implementation. + If you request `try_nopython` to `True` and the compilation fails, it will raise a + RuntimeError with the failure reason. 
""" def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None, @@ -623,14 +622,14 @@ def __init__(self, dataset, batch_size=None, shuffle=False, sampler=None, self._dataset, self._batch_sampler, self._batchify_fn) if not use_mx_iter: if try_nopython: - warnings.warn(mx_iter_args) + raise RuntimeError(mx_iter_args) else: use_mx_iter = False if use_mx_iter: logging.info("Using MXNet backend ThreadedDataLoader with %s workers " "instead of python dataloader.", self._num_workers) - self._mx_iter = MXThreadedDataLoader( + self._mx_iter = _MXThreadedDataLoader( num_workers=self._num_workers, pin_memory=self._pin_memory, pin_device_id=self._pin_device_id, @@ -730,7 +729,7 @@ def _check_mx_loader_capability(dataset, batch_sampler, batchify_fn): return True, mx_loader_args -class MXThreadedDataLoader(object): +class _MXThreadedDataLoader(object): """MXNet internal C++ threaded Data Iterator in form of DataLoader parameters diff --git a/src/io/iter_sampler.cc b/src/io/iter_sampler.cc index 8566d1c983e3..b0d7716ab692 100644 --- a/src/io/iter_sampler.cc +++ b/src/io/iter_sampler.cc @@ -126,7 +126,9 @@ class RandomSampler : public IIterator { param_.InitAllowUnknown(kwargs); indices_.resize(param_.length); std::iota(std::begin(indices_), std::end(indices_), 0); // fill like arange - rng_.reset(new common::RANDOM_ENGINE(kRandMagic + param_.seed)); + mshadow::Random *ctx_rng = ResourceManager::Get()->Request( + Context::CPU(), ResourceRequest::kRandom).get_random(nullptr); + rng_.reset(new common::RANDOM_ENGINE(ctx_rng->GetSeed() + param_.seed)); out_.data.resize(2); // label required by DataBatch, we can use fake label here out_.data[1] = TBlob(indices_.data(), TShape({1, }), cpu::kDevMask, 0); BeforeFirst(); @@ -155,8 +157,6 @@ class RandomSampler : public IIterator { return out_; } private: - /*! \brief random magic number */ - static const int kRandMagic = 2333; /*! \brief Stored integer indices */ std::vector indices_; /*! 
\brief current position for iteration */ diff --git a/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py b/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py index 14f5f95cd8f7..9c9c7fd1ed56 100644 --- a/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py +++ b/tests/python/unittest/test_numpy_contrib_gluon_data_vision.py @@ -133,7 +133,6 @@ def test_image_bbox_iter(self): @use_np def test_bbox_augmenters(self): # only test if all augmenters will work - # TODO(Joshua Zhang): verify the augmenter outputs im_list = [_generate_objects() + [x] for x in self.IMAGES] det_iter = mx.gluon.contrib.data.vision.ImageBboxDataLoader(2, (3, 300, 300), imglist=im_list, path_root='', rand_crop=1, rand_pad=1, rand_gray=0.1, rand_mirror=True, mean=True, diff --git a/tests/python/unittest/test_numpy_interoperability.py b/tests/python/unittest/test_numpy_interoperability.py index 7309327d27c3..342372c98b30 100644 --- a/tests/python/unittest/test_numpy_interoperability.py +++ b/tests/python/unittest/test_numpy_interoperability.py @@ -1818,7 +1818,7 @@ def test_shapes(): a = np.ones((2,), dtype=dt) b = np.ones((2,), dtype=dt) OpArgMngr.add_workload('matmul', a, b) - + def test_result_types(): mat = np.ones((1,1)) vec = np.ones((1,)) @@ -1827,7 +1827,7 @@ def test_result_types(): v = vec.astype(dt) for arg in [(m, v), (v, m), (m, m)]: OpArgMngr.add_workload('matmul', *arg) - + def test_scalar_output(): vec1 = np.array([2]) vec2 = np.array([3, 4]).reshape(1, -1) @@ -1836,7 +1836,7 @@ def test_scalar_output(): v2 = vec2.astype(dt) OpArgMngr.add_workload('matmul', v1, v2) OpArgMngr.add_workload('matmul', v2.T, v1) - + def test_vector_vector_values(): vec1 = np.array([1, 2]) vec2 = np.array([3, 4]).reshape(-1, 1) @@ -1868,7 +1868,7 @@ def test_matrix_vector_values(): m2 = mat2.astype(dt) OpArgMngr.add_workload('matmul', m1, v) OpArgMngr.add_workload('matmul', m2, v) - + def test_matrix_matrix_values(): mat1 = np.array([[1, 2], [3, 4]]) mat2 = np.array([[1, 0], [1, 1]]) @@ -3274,4 +3274,3 @@ def test_np_array_ufunc_protocol(): def test_np_fallback_ops(): op_list = np.fallback.__all__ + ['linalg.{}'.format(op_name) for op_name in np.fallback_linalg.__all__] check_interoperability(op_list) - From 21d17d9f5e15ea24a854c0fe56de349024699546 Mon Sep 17 00:00:00 2001 From: "Joshua Z. Zhang" Date: Wed, 29 Apr 2020 12:07:00 -0700 Subject: [PATCH 13/17] thread safe dataset --- include/mxnet/io.h | 4 -- src/io/dataloader.cc | 15 ++--- src/io/dataset.cc | 86 +++++++----------------- tests/python/unittest/test_gluon_data.py | 2 +- 4 files changed, 28 insertions(+), 79 deletions(-) diff --git a/include/mxnet/io.h b/include/mxnet/io.h index 177be27a7c12..4b896066642b 100644 --- a/include/mxnet/io.h +++ b/include/mxnet/io.h @@ -134,10 +134,6 @@ class Dataset { */ virtual uint64_t GetLen(void) const = 0; /*! - * \brief Create a copy of dataset for threaded worker - */ - virtual Dataset* Clone(void) const = 0; - /*! 
diff --git a/src/io/dataloader.cc b/src/io/dataloader.cc index 2ddd8cf73d89..3589655328b5 100644 --- a/src/io/dataloader.cc +++ b/src/io/dataloader.cc
@@ -83,14 +83,8 @@ class ThreadedDataLoader : public IIterator<DataBatch> { threadget = omp_get_num_threads(); } param_.num_workers = std::max(1, threadget); - auto dataset = *static_cast<std::shared_ptr<Dataset>*>(reinterpret_cast<void*>(param_.dataset)); - datasets_.clear(); - datasets_.reserve(param_.num_workers); - datasets_.emplace_back(dataset); - for (int i = 1; i < param_.num_workers; ++i) { - datasets_.emplace_back(std::shared_ptr<Dataset>(dataset->Clone())); - } - dataset_len_ = datasets_[0]->GetLen(); + dataset_ = *static_cast<std::shared_ptr<Dataset>*>(reinterpret_cast<void*>(param_.dataset)); + dataset_len_ = dataset_->GetLen(); sampler_ = static_cast<IIterator<DataBatch>* >(reinterpret_cast<void*>(param_.sampler)); batchify_fn_ = *static_cast<BatchifyFunctionPtr*>(reinterpret_cast<void*>(param_.batchify_fn)); this->BeforeFirst();
@@ -126,7 +120,7 @@ for (int i = 0; i < real_batch_size; ++i) { omp_exc_.Run([&] { auto idx = idx_ptrs[i]; - CHECK(datasets_[i % param_.num_workers]->GetItem(idx, &inputs[i])) + CHECK(dataset_->GetItem(idx, &inputs[i])) << "Error getting data # " << idx; }); }
@@ -170,8 +164,7 @@ /*! \brief batched buffer */ std::vector batched_buffer_; /*! \brief pointer to dataset */ - // std::shared_ptr<Dataset> dataset_; - std::vector<std::shared_ptr<Dataset> > datasets_; + std::shared_ptr<Dataset> dataset_; /*! \brief dataset length */ int64_t dataset_len_; /*! \brief pointer to sampler iterator */
diff --git a/src/io/dataset.cc b/src/io/dataset.cc index 11cab3672a7c..db2c93638aaf 100644 --- a/src/io/dataset.cc +++ b/src/io/dataset.cc
@@ -64,10 +64,6 @@ class RecordFileDataset final : public Dataset { explicit RecordFileDataset(const std::vector<std::pair<std::string, std::string> >& kwargs) { std::vector<std::pair<std::string, std::string> > kwargs_left; param_.InitAllowUnknown(kwargs); - // open record file for read - dmlc::Stream *stream = dmlc::Stream::Create(param_.rec_file.c_str(), "r"); - reader_ = std::make_shared<dmlc::RecordIOReader>(stream); - stream_.reset(stream); // read and process idx file dmlc::Stream *idx_stream = dmlc::Stream::Create(param_.idx_file.c_str(), "r"); dmlc::istream is(idx_stream);
@@ -78,19 +74,6 @@ delete idx_stream; } - RecordFileDataset* Clone(void) const { - auto other = new RecordFileDataset(std::vector<std::pair<std::string, std::string> >()); - other->param_ = param_; - other->idx_ = idx_; - // do not share the pointer since it's not threadsafe to seek simultaneously - if (reader_ && stream_) { - dmlc::Stream *stream = dmlc::Stream::Create(param_.rec_file.c_str(), "r"); - other->reader_ = std::make_shared<dmlc::RecordIOReader>(stream); - other->stream_.reset(stream); - } - return other; - } - uint64_t GetLen() const { return idx_.size(); }
@@ -98,33 +81,37 @@ bool GetItem(uint64_t idx, std::vector<NDArray>* ret) { ret->resize(1); auto& out = (*ret)[0]; + auto& reader = RecordIOPair::Get()->second; + if (!reader) { + auto s = dmlc::Stream::Create(param_.rec_file.c_str(), "r"); + auto& stream = RecordIOPair::Get()->first; + stream.reset(s); + reader = std::make_unique<dmlc::RecordIOReader>(s); + } size_t pos = idx_[static_cast<size_t>(idx)]; - { - std::lock_guard<std::mutex> lck(mutex_); - reader_->Seek(pos); - if (reader_->NextRecord(&read_buff_)) { - const char *buf = read_buff_.c_str(); - const size_t size = read_buff_.size(); - out = NDArray(TShape({static_cast<dim_t>(size)}), Context::CPU(), false, mshadow::kInt8); - TBlob dst = out.data();
- RunContext rctx{Context::CPU(), nullptr, nullptr, false}; - mxnet::ndarray::Copy<cpu, cpu>( - TBlob(const_cast<void*>(reinterpret_cast<const void*>(buf)), - out.shape(), cpu::kDevMask, out.dtype(), 0), - &dst, Context::CPU(), Context::CPU(), rctx); - } + auto read_buff = ReadBuff::Get(); + reader->Seek(pos); + if (reader->NextRecord(read_buff)) { + const char *buf = read_buff->c_str(); + const size_t size = read_buff->size(); + out = NDArray(TShape({static_cast<dim_t>(size)}), Context::CPU(), false, mshadow::kInt8); + TBlob dst = out.data(); + RunContext rctx{Context::CPU(), nullptr, nullptr, false}; + mxnet::ndarray::Copy<cpu, cpu>( + TBlob(const_cast<void*>(reinterpret_cast<const void*>(buf)), + out.shape(), cpu::kDevMask, out.dtype(), 0), + &dst, Context::CPU(), Context::CPU(), rctx); } return true; } private: + using ReaderPtr = std::unique_ptr<dmlc::RecordIOReader>; + using StreamPtr = std::unique_ptr<dmlc::Stream>; + using RecordIOPair = dmlc::ThreadLocalStore<std::pair<StreamPtr, ReaderPtr> >; + using ReadBuff = dmlc::ThreadLocalStore<std::string>; /*! \brief parameters */ RecordFileDatasetParam param_; - /*! \brief recordIO context */ - std::shared_ptr<dmlc::RecordIOReader> reader_; - std::shared_ptr<dmlc::Stream> stream_; - std::string read_buff_; - std::mutex mutex_; /*! \brief indices */ std::unordered_map<size_t, size_t> idx_; };
@@ -209,13 +196,6 @@ class ImageRecordFileDataset : public Dataset { base_ = std::make_shared<RecordFileDataset>(kwargs); } - ImageRecordFileDataset* Clone(void) const { - auto other = new ImageRecordFileDataset(std::vector<std::pair<std::string, std::string> >()); - other->param_ = param_; - other->base_.reset(base_->Clone()); - return other; - } - uint64_t GetLen() const { return base_->GetLen(); }
@@ -315,10 +295,6 @@ class ImageSequenceDataset final : public Dataset { img_list_ = dmlc::Split(param_.img_list, param_.path_sep); } - ImageSequenceDataset* Clone(void) const { - return new ImageSequenceDataset(*this); - } - uint64_t GetLen() const { return img_list_.size(); }
@@ -382,10 +358,6 @@ class NDArrayDataset final : public Dataset { size_ = data_.shape().begin()[0]; } - NDArrayDataset* Clone(void) const { - return new NDArrayDataset(*this); - } - uint64_t GetLen() const { return size_; }
@@ -461,10 +433,6 @@ class GroupDataset final : public Dataset { } } - GroupDataset* Clone(void) const { - return new GroupDataset(*this); - } - uint64_t GetLen() const { return size_; }
@@ -520,10 +488,6 @@ class IndexedDataset final : public Dataset { base_data_ = *static_cast<std::shared_ptr<Dataset>*>(reinterpret_cast<void*>(param_.base)); } - IndexedDataset* Clone(void) const { - return new IndexedDataset(*this); - } - uint64_t GetLen() const { return param_.indices.ndim(); }
@@ -638,10 +602,6 @@ class LazyTransformDataset final : public Dataset { virtual ~LazyTransformDataset(void) { } - LazyTransformDataset* Clone(void) const { - return new LazyTransformDataset(*this); - } - uint64_t GetLen() const { return base_data_->GetLen(); }
diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py index 06481e69bc16..0c1a0ad4c173 100644 --- a/tests/python/unittest/test_gluon_data.py +++ b/tests/python/unittest/test_gluon_data.py
@@ -109,7 +109,7 @@ def test_recordimage_dataset_with_data_loader_multiworker(): # with transform dataset = gluon.data.vision.ImageRecordDataset(recfile).transform(_dataset_transform_fn) - loader = gluon.data.DataLoader(dataset, 1, num_workers=5, try_nopython=True) + loader = gluon.data.DataLoader(dataset, 1, num_workers=5, try_nopython=None) for i, (x, y) in enumerate(loader): assert x.shape[0] == 1 and x.shape[3] == 3
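The test change above reflects the `try_nopython` contract this series settles on: together with the earlier dataloader.py hunk, `True` raises `RuntimeError` when the dataset cannot be lowered to the C++ ThreadedDataLoader (here, because of the Python `transform`), while `None` probes the backend and quietly falls back to the Python workers. A usage sketch under that reading:

```python
from mxnet import gluon, nd

dataset = gluon.data.ArrayDataset(nd.arange(100))

# try_nopython=None: use the C++ threaded loader when possible,
# otherwise fall back to the multiprocessing workers.
auto_loader = gluon.data.DataLoader(dataset, batch_size=10, num_workers=2,
                                    try_nopython=None)

# try_nopython=True: insist on the C++ path; a dataset wrapped in a Python
# .transform(...) would make this raise RuntimeError instead of warning.
strict_loader = gluon.data.DataLoader(dataset, batch_size=10, num_workers=2,
                                      try_nopython=True)
```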
Zhang" Date: Wed, 29 Apr 2020 15:56:06 -0700 Subject: [PATCH 14/17] address comments --- .../gluon/contrib/data/vision/transforms/bbox/bbox.py | 2 +- python/mxnet/image/image.py | 10 +++++----- python/mxnet/test_utils.py | 2 +- src/io/iter_prefetcher.h | 2 +- src/io/iter_sampler.cc | 8 +++----- 5 files changed, 11 insertions(+), 13 deletions(-) diff --git a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py index 42cfb7afaefb..1629c212957f 100644 --- a/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py +++ b/python/mxnet/gluon/contrib/data/vision/transforms/bbox/bbox.py @@ -277,7 +277,7 @@ def forward(self, img, bbox): if isinstance(self._fill, numeric_types): dst = F.full(shape=(oh, ow, c), val=self._fill, dtype=img.dtype) else: - fill = F.array(self._fill, dtype=img.dtype, ctx=img.context) + fill = F.array(self._fill, dtype=img.dtype, ctx=img.ctx) if not c == fill.size: raise ValueError("Channel and fill size mismatch, {} vs {}".format(c, fill.size)) dst = F.tile(fill.reshape((1, c)), reps=(oh * ow, 1)).reshape((oh, ow, c)) diff --git a/python/mxnet/image/image.py b/python/mxnet/image/image.py index 86b1cf25ae62..4ce4139a0afb 100644 --- a/python/mxnet/image/image.py +++ b/python/mxnet/image/image.py @@ -656,14 +656,14 @@ def imrotate(src, rotation_degrees, zoom_in=False, zoom_out=False): # when a scalar is passed we wrap it into an array if isinstance(rotation_degrees, Number): rotation_degrees = nd.array([rotation_degrees] * len(src), - ctx=src.context) + ctx=src.ctx) if len(src) != len(rotation_degrees): raise ValueError( "The number of images must be equal to the number of rotation angles" ) - rotation_degrees = rotation_degrees.as_in_context(src.context) + rotation_degrees = rotation_degrees.as_in_context(src.ctx) rotation_rad = np.pi * rotation_degrees / 180 # reshape the rotations angle in order to be broadcasted # over the `src` tensor @@ -674,10 +674,10 @@ def imrotate(src, rotation_degrees, zoom_in=False, zoom_out=False): hscale = (float(h - 1) / 2) wscale = (float(w - 1) / 2) h_matrix = ( - nd.repeat(nd.arange(h, ctx=src.context).astype('float32').reshape(h, 1), w, axis=1) - hscale + nd.repeat(nd.arange(h, ctx=src.ctx).astype('float32').reshape(h, 1), w, axis=1) - hscale ).expand_dims(axis=0) w_matrix = ( - nd.repeat(nd.arange(w, ctx=src.context).astype('float32').reshape(1, w), h, axis=0) - wscale + nd.repeat(nd.arange(w, ctx=src.ctx).astype('float32').reshape(1, w), h, axis=0) - wscale ).expand_dims(axis=0) # perform rotation on the grid c_alpha = nd.cos(rotation_rad) @@ -689,7 +689,7 @@ def imrotate(src, rotation_degrees, zoom_in=False, zoom_out=False): w_matrix_rot = w_matrix_rot / wscale h_matrix_rot = h_matrix_rot / hscale - h, w = nd.array([h], ctx=src.context), nd.array([w], ctx=src.context) + h, w = nd.array([h], ctx=src.ctx), nd.array([w], ctx=src.ctx) # compute the scale factor in case `zoom_in` or `zoom_out` are True if zoom_in or zoom_out: rho_corner = nd.sqrt(h * h + w * w) diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index e9b17ad61b18..81972891d7c9 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -584,7 +584,7 @@ def assert_almost_equal(a, b, rtol=None, atol=None, names=('a', 'b'), equal_nan= atol = get_atol(atol) use_np_allclose = isinstance(a, np.ndarray) and isinstance(b, np.ndarray) if not use_np_allclose: - if not (hasattr(a, 'context') and hasattr(b, 'context') and a.context == b.context and a.dtype == b.dtype): + if 
diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index e9b17ad61b18..81972891d7c9 100644 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py
@@ -584,7 +584,7 @@ def assert_almost_equal(a, b, rtol=None, atol=None, names=('a', 'b'), equal_nan= atol = get_atol(atol) use_np_allclose = isinstance(a, np.ndarray) and isinstance(b, np.ndarray) if not use_np_allclose: - if not (hasattr(a, 'context') and hasattr(b, 'context') and a.context == b.context and a.dtype == b.dtype): + if not (hasattr(a, 'ctx') and hasattr(b, 'ctx') and a.ctx == b.ctx and a.dtype == b.dtype): use_np_allclose = True if isinstance(a, mx.nd.NDArray): a = a.asnumpy()
diff --git a/src/io/iter_prefetcher.h b/src/io/iter_prefetcher.h index dae11d36e252..c416c9d7b9be 100644 --- a/src/io/iter_prefetcher.h +++ b/src/io/iter_prefetcher.h
@@ -99,7 +99,7 @@ class PrefetcherIter : public IIterator<DataBatch> { // copy data over for (size_t i = 0; i < batch.data.size(); ++i) { if ((*dptr)->data.at(i).shape() != batch.data[i].shape_) { - // perf warning, dynamic buffer might be slow + // TODO(zhreshold): memory pool for dynamic shaped data (*dptr)->data.at(i).ReshapeAndAlloc(batch.data[i].shape_); } CHECK_EQ((*dptr)->data.at(i).shape(), batch.data[i].shape_);
diff --git a/src/io/iter_sampler.cc b/src/io/iter_sampler.cc index b0d7716ab692..0f6e4621fd3f 100644 --- a/src/io/iter_sampler.cc +++ b/src/io/iter_sampler.cc
@@ -56,8 +56,7 @@ class SequentialSampler : public IIterator<DataInst> { param_.InitAllowUnknown(kwargs); indices_.resize(param_.length); std::iota(std::begin(indices_), std::end(indices_), 0); // fill like arange - out_.data.resize(2); // label required by DataBatch, we can use fake label here - out_.data[1] = TBlob(indices_.data(), TShape({1, }), cpu::kDevMask, 0); + out_.data.resize(1); } virtual void BeforeFirst(void) {
@@ -129,8 +128,7 @@ class RandomSampler : public IIterator<DataInst> { mshadow::Random<cpu> *ctx_rng = ResourceManager::Get()->Request( Context::CPU(), ResourceRequest::kRandom).get_random<cpu, float>(nullptr); rng_.reset(new common::RANDOM_ENGINE(ctx_rng->GetSeed() + param_.seed)); - out_.data.resize(2); // label required by DataBatch, we can use fake label here - out_.data[1] = TBlob(indices_.data(), TShape({1, }), cpu::kDevMask, 0); + out_.data.resize(1); BeforeFirst(); }
@@ -164,7 +162,7 @@ /*! \brief data for next value */ DataInst out_; /*! \brief random generator engine */ - std::unique_ptr<common::RANDOM_ENGINE> rng_; + std::unique_ptr<common::RANDOM_ENGINE> rng_; /*! \brief arguments */ RandomSamplerParam param_; }; // class RandomSampler
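The sampler hunks in this patch drop the fake label, so a sampler now yields bare indices. On the Python side the corresponding contract is the public sampler API (sketch; `RandomSampler` here is `gluon.data.RandomSampler`, not the C++ class above):

```python
from mxnet import gluon

sampler = gluon.data.RandomSampler(5)  # shuffled indices 0..4, no labels
batches = gluon.data.BatchSampler(sampler, batch_size=2, last_batch='keep')
for idx_batch in batches:
    print(idx_batch)  # e.g. [3, 0], then [4, 1], then [2]
```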
Zhang" Date: Tue, 5 May 2020 13:32:28 -0700 Subject: [PATCH 15/17] address comments --- src/io/dataloader.cc | 6 +++--- src/io/dataset.cc | 28 ++++++++++++++-------------- src/io/iter_sampler.cc | 8 ++------ 3 files changed, 19 insertions(+), 23 deletions(-) diff --git a/src/io/dataloader.cc b/src/io/dataloader.cc index 3589655328b5..947c26202b5c 100644 --- a/src/io/dataloader.cc +++ b/src/io/dataloader.cc @@ -48,11 +48,11 @@ struct ThreadedDataLoaderParam : public dmlc::Parameter DMLC_DECLARE_FIELD(num_workers).set_default(0) .describe("Number of thread workers."); DMLC_DECLARE_FIELD(dataset) - .describe("Number of thread workers."); + .describe("Pointer to shared Dataset."); DMLC_DECLARE_FIELD(sampler) - .describe("Number of thread workers."); + .describe("Pointer to Sampler."); DMLC_DECLARE_FIELD(batchify_fn) - .describe("Number of thread workers."); + .describe("Pointer to Batchify function."); DMLC_DECLARE_FIELD(pin_device_id).set_default(-1) .describe("If not negative, will move data to pinned memory."); } diff --git a/src/io/dataset.cc b/src/io/dataset.cc index db2c93638aaf..7fd0405e41d0 100644 --- a/src/io/dataset.cc +++ b/src/io/dataset.cc @@ -32,6 +32,7 @@ #include #include #include +#include #include "../imperative/cached_op.h" #include "../imperative/naive_cached_op.h" @@ -81,19 +82,16 @@ class RecordFileDataset final : public Dataset { bool GetItem(uint64_t idx, std::vector* ret) { ret->resize(1); auto& out = (*ret)[0]; - auto& reader = RecordIOPair::Get()->second; - if (!reader) { + if (!reader_) { auto s = dmlc::Stream::Create(param_.rec_file.c_str(), "r"); - auto& stream = RecordIOPair::Get()->first; - stream.reset(s); - reader = std::make_unique(s); + stream_.reset(s); + reader_ = std::make_unique(s); } size_t pos = idx_[static_cast(idx)]; - auto read_buff = ReadBuff::Get(); - reader->Seek(pos); - if (reader->NextRecord(read_buff)) { - const char *buf = read_buff->c_str(); - const size_t size = read_buff->size(); + reader_->Seek(pos); + if (reader_->NextRecord(&read_buff_)) { + const char *buf = read_buff_.c_str(); + const size_t size = read_buff_.size(); out = NDArray(TShape({static_cast(size)}), Context::CPU(), false, mshadow::kInt8); TBlob dst = out.data(); RunContext rctx{Context::CPU(), nullptr, nullptr, false}; @@ -106,14 +104,16 @@ class RecordFileDataset final : public Dataset { } private: - using ReaderPtr = std::unique_ptr; - using StreamPtr = std::unique_ptr; - using RecordIOPair = dmlc::ThreadLocalStore >; - using ReadBuff = dmlc::ThreadLocalStore; /*! \brief parameters */ RecordFileDatasetParam param_; /*! \brief indices */ std::unordered_map idx_; + /*! \brief thread local recordio stream */ + static thread_local std::unique_ptr stream_; + /*! \brief thread local recordio reader */ + static thread_local std::unique_ptr reader_; + /*! \brief thread local read buffer */ + static thread_local std::string read_buff_; }; MXNET_REGISTER_IO_DATASET(RecordFileDataset) diff --git a/src/io/iter_sampler.cc b/src/io/iter_sampler.cc index 0f6e4621fd3f..932bcc9fe38e 100644 --- a/src/io/iter_sampler.cc +++ b/src/io/iter_sampler.cc @@ -37,7 +37,7 @@ namespace io { struct SequentialSamplerParam : public dmlc::Parameter { /*! \brief Length of the sequence. */ size_t length; - /*! \brief Random seed.*/ + /*! \brief start index.*/ int start; // declare parameters DMLC_DECLARE_PARAMETER(SequentialSamplerParam) { @@ -106,14 +106,10 @@ MXNET_REGISTER_IO_ITER(SequentialSampler) struct RandomSamplerParam : public dmlc::Parameter { /*! \brief Length of the sequence. 
diff --git a/src/io/iter_sampler.cc b/src/io/iter_sampler.cc index 0f6e4621fd3f..932bcc9fe38e 100644 --- a/src/io/iter_sampler.cc +++ b/src/io/iter_sampler.cc
@@ -37,7 +37,7 @@ namespace io { struct SequentialSamplerParam : public dmlc::Parameter<SequentialSamplerParam> { /*! \brief Length of the sequence. */ size_t length; - /*! \brief Random seed.*/ + /*! \brief start index.*/ int start; // declare parameters DMLC_DECLARE_PARAMETER(SequentialSamplerParam) { DMLC_DECLARE_FIELD(length) .describe("Length of the sequence.");
@@ -106,14 +106,10 @@ MXNET_REGISTER_IO_ITER(SequentialSampler) struct RandomSamplerParam : public dmlc::Parameter<RandomSamplerParam> { /*! \brief Length of the sequence. */ size_t length; - /*! \brief Random seed.*/ - int seed; // declare parameters DMLC_DECLARE_PARAMETER(RandomSamplerParam) { DMLC_DECLARE_FIELD(length) .describe("Length of the sequence."); - DMLC_DECLARE_FIELD(seed).set_default(0) - .describe("Random seed."); } }; // struct RandomSamplerParam
@@ -127,7 +123,7 @@ class RandomSampler : public IIterator<DataInst> { std::iota(std::begin(indices_), std::end(indices_), 0); // fill like arange mshadow::Random<cpu> *ctx_rng = ResourceManager::Get()->Request( Context::CPU(), ResourceRequest::kRandom).get_random<cpu, float>(nullptr); - rng_.reset(new common::RANDOM_ENGINE(ctx_rng->GetSeed() + param_.seed)); + rng_.reset(new common::RANDOM_ENGINE(ctx_rng->GetSeed())); out_.data.resize(1); BeforeFirst(); }
From 46707f2448cbf5c7818e1170f0f9d4631b7cd9b5 Mon Sep 17 00:00:00 2001 From: "Joshua Z. Zhang" Date: Tue, 5 May 2020 14:18:45 -0700 Subject: [PATCH 16/17] fix --- src/io/dataset.cc | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-)
diff --git a/src/io/dataset.cc b/src/io/dataset.cc index 7fd0405e41d0..6ae174a010a1 100644 --- a/src/io/dataset.cc +++ b/src/io/dataset.cc
@@ -82,16 +82,19 @@ bool GetItem(uint64_t idx, std::vector<NDArray>* ret) { ret->resize(1); auto& out = (*ret)[0]; - if (!reader_) { + static thread_local std::unique_ptr<dmlc::Stream> stream; + static thread_local std::unique_ptr<dmlc::RecordIOReader> reader; + if (!reader) { auto s = dmlc::Stream::Create(param_.rec_file.c_str(), "r"); - stream_.reset(s); - reader_ = std::make_unique<dmlc::RecordIOReader>(s); + stream.reset(s); + reader = std::make_unique<dmlc::RecordIOReader>(s); } size_t pos = idx_[static_cast<size_t>(idx)]; - reader_->Seek(pos); - if (reader_->NextRecord(&read_buff_)) { - const char *buf = read_buff_.c_str(); - const size_t size = read_buff_.size(); + reader->Seek(pos); + static thread_local std::string read_buff; + if (reader->NextRecord(&read_buff)) { + const char *buf = read_buff.c_str(); + const size_t size = read_buff.size(); out = NDArray(TShape({static_cast<dim_t>(size)}), Context::CPU(), false, mshadow::kInt8); TBlob dst = out.data(); RunContext rctx{Context::CPU(), nullptr, nullptr, false};
@@ -108,12 +111,6 @@ RecordFileDatasetParam param_; /*! \brief indices */ std::unordered_map<size_t, size_t> idx_; - /*! \brief thread local recordio stream */ - static thread_local std::unique_ptr<dmlc::Stream> stream_; - /*! \brief thread local recordio reader */ - static thread_local std::unique_ptr<dmlc::RecordIOReader> reader_; - /*! \brief thread local read buffer */ - static thread_local std::string read_buff_; }; MXNET_REGISTER_IO_DATASET(RecordFileDataset)
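With the readers now function-local, one `RecordFileDataset` instance can safely serve many loader threads at once. From Python, the Gluon dataset this class backs is used as before (sketch; the paths are placeholders):

```python
from mxnet import gluon

# The matching .idx file is expected next to the .rec file, as in
# RecordFileDatasetParam above.
dataset = gluon.data.RecordFileDataset('data/train.rec')
print(len(dataset))   # GetLen(): number of entries in the idx file
record = dataset[0]   # GetItem(): the raw packed record at index 0
```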
Zhang" Date: Wed, 6 May 2020 23:14:45 -0700 Subject: [PATCH 17/17] serial pytest for data download --- tests/python/unittest/test_gluon_data.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/python/unittest/test_gluon_data.py b/tests/python/unittest/test_gluon_data.py index 0c1a0ad4c173..035ae8f19012 100644 --- a/tests/python/unittest/test_gluon_data.py +++ b/tests/python/unittest/test_gluon_data.py @@ -30,6 +30,7 @@ from mxnet import context from mxnet.gluon.data.dataset import Dataset from mxnet.gluon.data.dataset import ArrayDataset +import pytest @with_seed() def test_array_dataset(): @@ -149,6 +150,7 @@ def test_datasets(): assert len(gluon.data.vision.CIFAR100(root='data/cifar100', train=False)) == 10000 @with_seed() +@pytest.mark.serial def test_datasets_handles(): assert len(gluon.data.vision.MNIST(root='data/mnist').__mx_handle__()) == 60000 assert len(gluon.data.vision.MNIST(root='data/mnist', train=False).__mx_handle__()) == 10000