diff --git a/benchmark/opperf/utils/profiler_utils.py b/benchmark/opperf/utils/profiler_utils.py
index 1cb29a8fdec8..087746ab728d 100644
--- a/benchmark/opperf/utils/profiler_utils.py
+++ b/benchmark/opperf/utils/profiler_utils.py
@@ -117,7 +117,7 @@ def parse_profiler_dump(operator_name, profiler_dump):
     MXNDArrayFree                          49           1.1220           0.0170           0.0360           0.0229
     MXAutogradBackwardEx                   50          11.5460           0.1980           0.3360           0.2309
     MXNet C API Calls                     399           1.9990           1.6010           1.9990           0.1990
-    MXImperativeInvokeEx                   50           4.4810           0.0700           0.1330           0.0896
+    MXImperativeInvoke                     50           4.4810           0.0700           0.1330           0.0896
     MXNDArrayWaitAll                       50         769.0570          14.0200          24.5030          15.3811
     MXAutogradSetIsTraining               100           0.0190           0.0000           0.0010           0.0002
     MXAutogradSetIsRecording              100           0.0400           0.0000           0.0010           0.0004
diff --git a/docs/static_site/src/pages/api/cpp/docs/tutorials/multi_threaded_inference.md b/docs/static_site/src/pages/api/cpp/docs/tutorials/multi_threaded_inference.md
index f490aa12e6fc..08cfea115c7d 100644
--- a/docs/static_site/src/pages/api/cpp/docs/tutorials/multi_threaded_inference.md
+++ b/docs/static_site/src/pages/api/cpp/docs/tutorials/multi_threaded_inference.md
@@ -50,12 +50,12 @@ for MXNet users to do multi-threaded inference.
  * \brief create cached operator, allows to choose thread_safe version
  * of cachedop
  */
-MXNET_DLL int MXCreateCachedOpEX(SymbolHandle handle,
-                                 int num_flags,
-                                 const char** keys,
-                                 const char** vals,
-                                 CachedOpHandle *out,
-                                 bool thread_safe DEFAULT(false));
+MXNET_DLL int MXCreateCachedOp(SymbolHandle handle,
+                               int num_flags,
+                               const char** keys,
+                               const char** vals,
+                               CachedOpHandle *out,
+                               bool thread_safe DEFAULT(false));
 ```
 
 ## Multithreaded inference in MXNet with C API and CPP Package
@@ -135,8 +135,8 @@ The above code loads params and copies input data and params to specific context
 [https://github.com/apache/incubator-mxnet/example/multi_threaded_inference/multi_threaded_inference.cc#L207-L233](multi_threaded_inference.cc#L207-233)
 
 The above code prepares `flag_key_cstrs` and `flag_val_cstrs` to be passed the Cached op.
-The C API call is made with `MXCreateCachedOpEX`. This will lead to creation of thread safe cached
-op since the `thread_safe` (which is the last parameter to `MXCreateCachedOpEX`) is set to
+The C API call is made with `MXCreateCachedOp`. This creates a thread-safe cached op
+because `thread_safe` (the last parameter to `MXCreateCachedOp`) is set to
 true. When this is set to false, it will invoke CachedOp instead of CachedOpThreadSafe.
 
 
@@ -146,7 +146,7 @@ true. When this is set to false, it will invoke CachedOp instead of CachedOpThre
 
 The above creates the lambda function taking the thread number as the argument.
 If `random_sleep` is set it will sleep for a random number (secs) generated between 0 to 5 seconds.
-Following this, it invokes `MXInvokeCachedOpEx`(from the hdl it determines whether to invoke cached op threadsafe version or not).
+Following this, it invokes `MXInvokeCachedOp` (from the hdl it determines whether to invoke the thread-safe version of the cached op or not).
 When this is set to false, it will invoke CachedOp instead of CachedOpThreadSafe.
 
 ### Step 5: Spawn multiple threads and wait for all threads to complete
@@ -179,7 +179,7 @@ The above code outputs results for different threads and cleans up the thread sa
 6. Bulking of ops is not supported.
 7. This only supports inference use cases currently, training use cases are not supported.
 8. Graph rewrites with subgraph API currently not supported.
-9. There is currently no frontend API support to run multi threaded inference. Users can use CreateCachedOpEX and InvokeCachedOp in combination with
+9. There is currently no frontend API support to run multi threaded inference. Users can use `MXCreateCachedOp` and `MXInvokeCachedOp` in combination with
 the CPP frontend to run multi-threaded inference as of today.
 10. Multi threaded inference with threaded engine with Module/Symbolic API and C Predict API are not currently supported.
 11. Exception thrown with `wait_to_read` in individual threads can cause issues. Calling invoke from each thread and calling WaitAll after thread joins should still work fine.
diff --git a/docs/static_site/src/pages/api/developer_guide/profiling.md b/docs/static_site/src/pages/api/developer_guide/profiling.md
index 841c00891b6b..8fad066afce8 100644
--- a/docs/static_site/src/pages/api/developer_guide/profiling.md
+++ b/docs/static_site/src/pages/api/developer_guide/profiling.md
@@ -130,11 +130,11 @@ MXNET_C_API
 =================
 Name                          Total Count        Time (ms)    Min Time (ms)    Max Time (ms)    Avg Time (ms)
 ----                          -----------        ---------    -------------    -------------    -------------
-MXImperativeInvokeEx                    2           0.3360           0.0990           0.2370           0.1680
+MXImperativeInvoke                      2           0.3360           0.0990           0.2370           0.1680
 MXNet C API Calls                      17           0.2320           0.2160           0.2320           0.0080
 MXNDArraySyncCopyFromCPU                1           0.1750           0.1750           0.1750           0.1750
-MXNDArrayCreateEx                       1           0.1050           0.1050           0.1050           0.1050
-MXNDArrayGetShapeEx                    11           0.0210           0.0000           0.0160           0.0019
+MXNDArrayCreate                         1           0.1050           0.1050           0.1050           0.1050
+MXNDArrayGetShape                      11           0.0210           0.0000           0.0160           0.0019
 MXNDArrayWaitAll                        1           0.0200           0.0200           0.0200           0.0200
 MXNDArrayGetDType                       1           0.0010           0.0010           0.0010           0.0010
 MXNet C API Concurrency                34           0.0000           0.0000           0.0010           0.0000
@@ -157,8 +157,8 @@ The profiling data has captured info about interesting functions that have execu
 
 |**Function Name**	|**Description**	|
 |---	|---	|
-|**MXImperativeInvokeEx**	| invokes an operator to perform the computation |
-|**MXNDArrayCreateEx**	| creates  an ndarray	|
+|**MXImperativeInvoke**	| invokes an operator to perform the computation |
+|**MXNDArrayCreate**	| creates  an ndarray	|
 | **MXNDArrayGetDType**	| returns  the data type of the ndarray |
 | **MXNDArrayGetShape**	| returns  the shape of the ndarray (as a tuple where each element is the size of a  dimension) |
 | **MXNDArraySyncCopyFromCPU** | called when data is initially residing outside of an MXNet data structure (ie.  numpy.ndarry rather than mxnet.numpy.ndarray). Data is copied into the MXNet  data structure   |
@@ -201,7 +201,7 @@ In the following list, #1 uses regular numpy functions to initialize data. MXNet
 ![dev_guide_profilling_3.png](/assets/img/dev_guide_profilling_3.png)
 Here, the four red arrows show the important events in this sequence.
 
-1. First, the `MXNDArrayCreateEx` is called to physically  allocate space to store the data and other necessary attributes in the `ndarray` class.
+1. First, `MXNDArrayCreate` is called to physically allocate space to store the data and other necessary attributes in the `ndarray` class.
 2. Then some support functions are called (`MXNDArrayGetShape,` `MXNDArrayGetDType`) while initialing the data structure.
 3. Finally the data is copied from the non-MXNet ndarray into the newly prepared MXNet ndarray by the `MXNDArraySyncCopyFromCPU`  function.
 
@@ -210,9 +210,9 @@ Next, #3 (in our code example) begins the computing process to produce our outpu
 ![dev_guide_profilling_4.png](/assets/img/dev_guide_profilling_4.png)
 Here you can see that the following sequence of events happen:
 
-1. `MXImperativeInvokeEx` is called the first time to launch the diagonal operator from #3 (in our code example).
+1. `MXImperativeInvoke` is called the first time to launch the diagonal operator from #3 (in our code example).
 2. Soon after that the actual **`diag`**  operator begins executing in another thread.
-3. While that is happening, our main thread moves on and calls `MXImperativeInvokeEx` again to launch the **`sum`**  operator. Just like before, this returns without actually executing the operator  and continues.
+3. While that is happening, our main thread moves on and calls `MXImperativeInvoke` again to launch the **`sum`**  operator. Just like before, this returns without actually executing the operator  and continues.
 4. Lastly, the `MXNDArrayWaitAll` is called as the main thread has progressed to #4 in our app. It will wait here while all the  computation finishes.
 
 Next lets look at a view of the part of the timeline zoomed to the actual operator execution.
@@ -274,6 +274,6 @@ The first red box is the first run, and the 2nd smaller one is the 2nd run. Firs
 
 
 ![dev_guide_profilling_7.png](/assets/img/dev_guide_profilling_7.png)
-We still have the same sequence of events at the beginning to initialize the MXNet ndarray (`MXNDArrayCreateEx`, `MXNDArrayGetShape`, `MXNDArrayGetDType`, `MXNDArraySyncCopyFromCPU`). Then the **`diag`** operator runs, followed by the **`sum`** operator, and finally the `waitall`. When you look at this, be careful about the assumptions that you make. In this version of the timeline, it appears that the operator executes after the `MXImperativeInvokeEx` runs, and seems to imply an inherent ordering. But realize that there is no dependency between the **`diag`** operator finishing and the next **`MXImperativeInvokeEx`** launching the **`sum`** operator. In this case, it just-so-happens that the **`diag`** operator finishes so quickly that it appears that way. But in reality the main thread is launching the operators and not waiting for them to finish. Lastly, keep in mind that in this case by the time we hit the **`MXNDArrayWaitAll`** everything is already done and we return immediately, but in other circumstances it may sit here waiting for everything to finish (like we saw earlier in the first run). 
+We still have the same sequence of events at the beginning to initialize the MXNet ndarray (`MXNDArrayCreate`, `MXNDArrayGetShape`, `MXNDArrayGetDType`, `MXNDArraySyncCopyFromCPU`). Then the **`diag`** operator runs, followed by the **`sum`** operator, and finally the `waitall`. When you look at this, be careful about the assumptions that you make. In this version of the timeline, it appears that the operator executes after the `MXImperativeInvoke` runs, and seems to imply an inherent ordering. But realize that there is no dependency between the **`diag`** operator finishing and the next **`MXImperativeInvoke`** launching the **`sum`** operator. In this case, it just-so-happens that the **`diag`** operator finishes so quickly that it appears that way. But in reality the main thread is launching the operators and not waiting for them to finish. Lastly, keep in mind that in this case by the time we hit the **`MXNDArrayWaitAll`** everything is already done and we return immediately, but in other circumstances it may sit here waiting for everything to finish (like we saw earlier in the first run). 
 
 
diff --git a/example/multi_threaded_inference/multi_threaded_inference.cc b/example/multi_threaded_inference/multi_threaded_inference.cc
index f1d0d72ef774..b0b6869027d7 100644
--- a/example/multi_threaded_inference/multi_threaded_inference.cc
+++ b/example/multi_threaded_inference/multi_threaded_inference.cc
@@ -226,9 +226,9 @@ void run_inference(const std::string& model_name, const std::vector<mxnet::cpp::
     flag_val_cstrs.emplace_back(flag_vals[i].c_str());
   }
 
-  int ret1 = MXCreateCachedOpEX(out.GetHandle(), flag_keys.size(),
-                                flag_key_cstrs.data(), flag_val_cstrs.data(),
-                                &hdl, true);
+  int ret1 = MXCreateCachedOp(out.GetHandle(), flag_keys.size(),
+                              flag_key_cstrs.data(), flag_val_cstrs.data(),
+                              &hdl, true);
   if (ret1 < 0) {
     LOG(FATAL) << MXGetLastError();
   }
@@ -256,8 +256,8 @@ void run_inference(const std::string& model_name, const std::vector<mxnet::cpp::
     }
     int num_output = 0;
     const int *stypes;
-    int ret = MXInvokeCachedOpEx(hdl, arr_handles[num].size(), arr_handles[num].data(),
-                                 cpu::kDevMask, 0, &num_output, &(cached_op_handles[num]), &stypes);
+    int ret = MXInvokeCachedOp(hdl, arr_handles[num].size(), arr_handles[num].data(),
+                               cpu::kDevMask, 0, &num_output, &(cached_op_handles[num]), &stypes);
     if (ret < 0) {
       LOG(FATAL) << MXGetLastError();
     }
diff --git a/include/mxnet/c_api.h b/include/mxnet/c_api.h
index 01ce18859ef4..1f900dd6f0d3 100644
--- a/include/mxnet/c_api.h
+++ b/include/mxnet/c_api.h
@@ -351,16 +351,6 @@ MXNET_DLL int MXDumpProcessProfile(int finished, int profile_process, KVStoreHan
  */
 MXNET_DLL int MXDumpProfile(int finished);
 
-
-/*!
- * \brief Deprecated, use MXAggregateProfileStatsPrintEx instead.
- * \param out_str Will receive a pointer to the output string
- * \param reset Clear the aggregate stats after printing
- * \return 0 when success, -1 when failure happens.
- * \note
- */
-MXNET_DLL int MXAggregateProfileStatsPrint(const char **out_str, int reset);
-
 /*!
  * \brief Print sorted aggregate stats to the a string
  *        How aggregate stats are stored will not change
@@ -372,8 +362,8 @@ MXNET_DLL int MXAggregateProfileStatsPrint(const char **out_str, int reset);
  * \return 0 when success, -1 when failure happens.
  * \note
  */
-MXNET_DLL int MXAggregateProfileStatsPrintEx(const char **out_str, int reset, int format,
-                                            int sort_by, int ascending);
+MXNET_DLL int MXAggregateProfileStatsPrint(const char **out_str, int reset, int format,
+                                           int sort_by, int ascending);
 
 /*!
  * \brief Pause profiler tuning collection
@@ -584,14 +574,18 @@ MXNET_DLL int MXLoadTVMConfig(ConfigSpaces config);
  * \return 0 when success, -1 when failure happens
  */
 MXNET_DLL int MXNDArrayCreateNone(NDArrayHandle *out);
+
 /*!
- * \brief create a NDArray with specified shape
+ * \brief create a NDArray with specified shape and data type
+ *  This api is available when MXNet is built with flag
+ *  USE_INT64_TENSOR_SIZE=0 (by default)
  * \param shape the pointer to the shape
  * \param ndim the dimension of the shape
  * \param dev_type device type, specify device we want to take
  * \param dev_id the device id of the specific device
  * \param delay_alloc whether to delay allocation until
  *    the narray is first mutated
+ * \param dtype data type of created array
  * \param out the returning handle
  * \return 0 when success, -1 when failure happens
  */
@@ -600,13 +594,15 @@ MXNET_DLL int MXNDArrayCreate(const uint32_t *shape,
                               int dev_type,
                               int dev_id,
                               int delay_alloc,
+                              int dtype,
                               NDArrayHandle *out);
+#define MXNDArrayCreateEx MXNDArrayCreate  // backward compatibility for external deps
 
 /*!
  * \brief create a NDArray with specified shape and data type
  *  This api is available when MXNet is built with flag
- *  USE_INT64_TENSOR_SIZE=0 (by default)
- * \param shape the pointer to the shape
+ *  USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support
+ * \param shape the pointer to int64_t shape
  * \param ndim the dimension of the shape
  * \param dev_type device type, specify device we want to take
  * \param dev_id the device id of the specific device
@@ -616,36 +612,14 @@ MXNET_DLL int MXNDArrayCreate(const uint32_t *shape,
  * \param out the returning handle
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXNDArrayCreateEx(const uint32_t *shape,
-                                uint32_t ndim,
+MXNET_DLL int MXNDArrayCreate64(const int64_t *shape,
+                                int ndim,
                                 int dev_type,
                                 int dev_id,
                                 int delay_alloc,
                                 int dtype,
                                 NDArrayHandle *out);
 
-/*!
- * \brief create a NDArray with specified shape and data type
- *  This api is available when MXNet is built with flag
- *  USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support
- * \param shape the pointer to int64_t shape
- * \param ndim the dimension of the shape
- * \param dev_type device type, specify device we want to take
- * \param dev_id the device id of the specific device
- * \param delay_alloc whether to delay allocation until
- *    the narray is first mutated
- * \param dtype data type of created array
- * \param out the returning handle
- * \return 0 when success, -1 when failure happens
- */
-MXNET_DLL int MXNDArrayCreateEx64(const int64_t *shape,
-                                  int ndim,
-                                  int dev_type,
-                                  int dev_id,
-                                  int delay_alloc,
-                                  int dtype,
-                                  NDArrayHandle *out);
-
 /*!
  * \brief create an empty sparse NDArray with specified shape and data type
  *  This api is available when MXNet is built with flag
@@ -943,17 +917,6 @@ MXNET_DLL int MXNDArrayReshape64(NDArrayHandle handle,
                                  dim_t *dims,
                                  bool reverse,
                                  NDArrayHandle *out);
-/*!
- * \brief DEPRECATED. Use MXNDArrayGetShapeEx instead.
- * get the shape of the array
- * \param handle the handle to the narray
- * \param out_dim the output dimension
- * \param out_pdata pointer holder to get data pointer of the shape
- * \return 0 when success, -1 when failure happens
- */
-MXNET_DLL int MXNDArrayGetShape(NDArrayHandle handle,
-                                uint32_t *out_dim,
-                                const uint32_t **out_pdata);
 
 /*!
  * \brief get the shape of the array
@@ -964,9 +927,9 @@ MXNET_DLL int MXNDArrayGetShape(NDArrayHandle handle,
  * \param out_pdata pointer holder to get data pointer of the shape
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXNDArrayGetShapeEx(NDArrayHandle handle,
-                                  int *out_dim,
-                                  const int **out_pdata);
+MXNET_DLL int MXNDArrayGetShape(NDArrayHandle handle,
+                                int *out_dim,
+                                const int **out_pdata);
 
 /*!
  * \brief get the shape of the array
@@ -977,9 +940,9 @@ MXNET_DLL int MXNDArrayGetShapeEx(NDArrayHandle handle,
  * \param out_pdata pointer holder to get data pointer of the shape
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXNDArrayGetShapeEx64(NDArrayHandle handle,
-                                    int *out_dim,
-                                    const int64_t **out_pdata);
+MXNET_DLL int MXNDArrayGetShape64(NDArrayHandle handle,
+                                  int *out_dim,
+                                  const int64_t **out_pdata);
 
 /*!
  * \brief get the content of the data in NDArray
@@ -1002,8 +965,7 @@ MXNET_DLL int MXNDArrayToDLPack(NDArrayHandle handle,
                                        DLManagedTensorHandle *out_dlpack);
 
 /*!
-* \brief DEPRECATED. Use MXNDArrayFromDLPackEx instead.
-
+* \brief Create a NDArray backed by a dlpack tensor.
 *
 * This allows us to create a NDArray using the memory
 * allocated by an external deep learning framework
@@ -1017,26 +979,9 @@ MXNET_DLL int MXNDArrayToDLPack(NDArrayHandle handle,
 * \return 0 when success, -1 when failure happens
 */
 MXNET_DLL int MXNDArrayFromDLPack(DLManagedTensorHandle dlpack,
+                                  const bool transient_handle,
                                   NDArrayHandle *out_handle);
 
-/*!
-* \brief Create a NDArray backed by a dlpack tensor.
-*
-* This allows us to create a NDArray using the memory
-* allocated by an external deep learning framework
-* that is DLPack compatible.
-*
-* The memory is retained until the NDArray went out of scope.
-*
-* \param dlpack the pointer of the input DLManagedTensor
-* \param transient_handle whether the handle will be destructed before calling the deleter
-* \param out_handle pointer holder to get pointer of NDArray
-* \return 0 when success, -1 when failure happens
-*/
-MXNET_DLL int MXNDArrayFromDLPackEx(DLManagedTensorHandle dlpack,
-                                    const bool transient_handle,
-                                    NDArrayHandle *out_handle);
-
 /*!
  * \brief Delete a dlpack tensor
  * \param dlpack the pointer of the input DLManagedTensor
@@ -1200,20 +1145,6 @@ MXNET_DLL int MXFuncDescribe(FunctionHandle fun,
                              uint32_t *num_scalars,
                              uint32_t *num_mutate_vars,
                              int *type_mask);
-/*!
- * \brief invoke a function, the array size of passed in arguments
- *   must match the values in the
- * \param fun the function
- * \param use_vars the normal arguments passed to function
- * \param scalar_args the scalar qarguments
- * \param mutate_vars the mutate arguments
- * \return 0 when success, -1 when failure happens
- * \sa MXFuncDescribeArgs
- */
-MXNET_DLL int MXFuncInvoke(FunctionHandle fun,
-                           NDArrayHandle *use_vars,
-                           float *scalar_args,
-                           NDArrayHandle *mutate_vars);
 /*!
  * \brief invoke a function, the array size of passed in arguments
  *   must match the values in the
@@ -1227,13 +1158,13 @@ MXNET_DLL int MXFuncInvoke(FunctionHandle fun,
  * \return 0 when success, -1 when failure happens
  * \sa MXFuncDescribeArgs
  */
-MXNET_DLL int MXFuncInvokeEx(FunctionHandle fun,
-                             NDArrayHandle *use_vars,
-                             float *scalar_args,
-                             NDArrayHandle *mutate_vars,
-                             int num_params,
-                             char **param_keys,
-                             char **param_vals);
+MXNET_DLL int MXFuncInvoke(FunctionHandle fun,
+                           NDArrayHandle *use_vars,
+                           float *scalar_args,
+                           NDArrayHandle *mutate_vars,
+                           int num_params,
+                           char **param_keys,
+                           char **param_vals);
 /*!
  * \brief invoke a nnvm op and imperative function
  * \param creator the op
@@ -1244,6 +1175,7 @@ MXNET_DLL int MXFuncInvokeEx(FunctionHandle fun,
  * \param num_params number of keyword parameters
  * \param param_keys keys for keyword parameters
  * \param param_vals values for keyword parameters
+ * \param out_stypes output ndarrays' stypes
  * \return 0 when success, -1 when failure happens
  */
 MXNET_DLL int MXImperativeInvoke(AtomicSymbolCreator creator,
@@ -1253,29 +1185,8 @@ MXNET_DLL int MXImperativeInvoke(AtomicSymbolCreator creator,
                                  NDArrayHandle **outputs,
                                  int num_params,
                                  const char **param_keys,
-                                 const char **param_vals);
-/*!
- * \brief invoke a nnvm op and imperative function
- * \param creator the op
- * \param num_inputs number of input NDArrays
- * \param inputs input NDArrays
- * \param num_outputs number of output NDArrays
- * \param outputs output NDArrays
- * \param num_params number of keyword parameters
- * \param param_keys keys for keyword parameters
- * \param param_vals values for keyword parameters
- * \param out_stypes output ndarrays' stypes
- * \return 0 when success, -1 when failure happens
- */
-MXNET_DLL int MXImperativeInvokeEx(AtomicSymbolCreator creator,
-                                   int num_inputs,
-                                   NDArrayHandle *inputs,
-                                   int *num_outputs,
-                                   NDArrayHandle **outputs,
-                                   int num_params,
-                                   const char **param_keys,
-                                   const char **param_vals,
-                                   const int **out_stypes);
+                                 const char **param_vals,
+                                 const int **out_stypes);
 /*!
  * \brief set whether to record operator for autograd
  * \param is_recording 1 when recording, 0 when not recording.
@@ -1387,29 +1298,17 @@ MXNET_DLL int MXAutogradBackwardEx(uint32_t num_output,
  * \param out output symbol handle
  */
 MXNET_DLL int MXAutogradGetSymbol(NDArrayHandle handle, SymbolHandle *out);
-/*!
- * \brief create cached operator
- */
-MXNET_DLL int MXCreateCachedOp(SymbolHandle handle, CachedOpHandle *out);
-/*!
- * \brief create cached operator
- */
-MXNET_DLL int MXCreateCachedOpEx(SymbolHandle handle,
-                                 int num_flags,
-                                 const char** keys,
-                                 const char** vals,
-                                 CachedOpHandle *out);
 
 /*!
  * \brief create cached operator, allows to choose thread_safe version
  * of cachedop
  */
-MXNET_DLL int MXCreateCachedOpEX(SymbolHandle handle,
-                                 int num_flags,
-                                 const char** keys,
-                                 const char** vals,
-                                 CachedOpHandle *out,
-                                 bool thread_safe DEFAULT(false));
+MXNET_DLL int MXCreateCachedOp(SymbolHandle handle,
+                               int num_flags,
+                               const char** keys,
+                               const char** vals,
+                               CachedOpHandle *out,
+                               bool thread_safe DEFAULT(false));
 
 /*!
  * \brief free cached operator
@@ -1434,14 +1333,14 @@ MXNET_DLL int MXCachedOpGetOptimizedSymbol(CachedOpHandle handle,
  * \param out_stypes output ndarrays' stypes
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXInvokeCachedOpEx(CachedOpHandle handle,
-                                 int num_inputs,
-                                 NDArrayHandle *inputs,
-                                 int default_dev_type,
-                                 int default_dev_id,
-                                 int *num_outputs,
-                                 NDArrayHandle **outputs,
-                                 const int** out_stypes);
+MXNET_DLL int MXInvokeCachedOp(CachedOpHandle handle,
+                               int num_inputs,
+                               NDArrayHandle *inputs,
+                               int default_dev_type,
+                               int default_dev_id,
+                               int *num_outputs,
+                               NDArrayHandle **outputs,
+                               const int** out_stypes);
 
 /*!
  * \brief cached op set monitor callback
@@ -1824,19 +1723,20 @@ MXNET_DLL int MXSymbolGrad(SymbolHandle sym,
                            uint32_t num_wrt,
                            const char** wrt,
                            SymbolHandle* out);
+
 /*!
- * \brief DEPRECATED. Use MXSymbolInferShapeEx instead.
- * infer shape of unknown input shapes given the known one.
+ * \brief infer shape of unknown input shapes given the known one.
  *  The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data
  *  The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional.
- *
+ *  This api is available when MXNet is built with flag
+ *  USE_INT64_TENSOR_SIZE=0 (by default)
  * \param sym symbol handle
- * \param num_args numbe of input arguments.
+ * \param num_args number of input arguments.
  * \param keys the key of keyword args (optional)
  * \param arg_ind_ptr the head pointer of the rows in CSR
  * \param arg_shape_data the content of the CSR
  * \param in_shape_size sizeof the returning array of in_shapes
- * \param in_shape_ndim returning array of shape dimensions of each input shape.
+ * \param in_shape_ndim returning array of shape dimensions of each input shape.
  * \param in_shape_data returning array of pointers to head of the input shape.
  * \param out_shape_size sizeof the returning array of out_shapes
  * \param out_shape_ndim returning array of shape dimensions of each output shape.
@@ -1851,16 +1751,16 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym,
                                  uint32_t num_args,
                                  const char** keys,
                                  const uint32_t *arg_ind_ptr,
-                                 const uint32_t *arg_shape_data,
+                                 const int *arg_shape_data,
                                  uint32_t *in_shape_size,
-                                 const uint32_t **in_shape_ndim,
-                                 const uint32_t ***in_shape_data,
+                                 const int **in_shape_ndim,
+                                 const int ***in_shape_data,
                                  uint32_t *out_shape_size,
-                                 const uint32_t **out_shape_ndim,
-                                 const uint32_t ***out_shape_data,
+                                 const int **out_shape_ndim,
+                                 const int ***out_shape_data,
                                  uint32_t *aux_shape_size,
-                                 const uint32_t **aux_shape_ndim,
-                                 const uint32_t ***aux_shape_data,
+                                 const int **aux_shape_ndim,
+                                 const int ***aux_shape_data,
                                  int *complete);
 
 /*!
@@ -1868,14 +1768,14 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym,
  *  The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data
  *  The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional.
  *  This api is available when MXNet is built with flag
- *  USE_INT64_TENSOR_SIZE=0 (by default)
+ *  USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support
  * \param sym symbol handle
  * \param num_args number of input arguments.
  * \param keys the key of keyword args (optional)
  * \param arg_ind_ptr the head pointer of the rows in CSR
  * \param arg_shape_data the content of the CSR
  * \param in_shape_size sizeof the returning array of in_shapes
- * \param in_shape_ndim returning array of shape dimensions of eachs input shape.
+ * \param in_shape_ndim returning array of shape dimensions of each input shape.
  * \param in_shape_data returning array of pointers to head of the input shape.
  * \param out_shape_size sizeof the returning array of out_shapes
  * \param out_shape_ndim returning array of shape dimensions of each output shape.
@@ -1886,71 +1786,33 @@ MXNET_DLL int MXSymbolInferShape(SymbolHandle sym,
  * \param complete whether infer shape completes or more information is needed.
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXSymbolInferShapeEx(SymbolHandle sym,
+MXNET_DLL int MXSymbolInferShape64(SymbolHandle sym,
                                    uint32_t num_args,
                                    const char** keys,
-                                   const uint32_t *arg_ind_ptr,
-                                   const int *arg_shape_data,
-                                   uint32_t *in_shape_size,
+                                   const int64_t *arg_ind_ptr,
+                                   const int64_t *arg_shape_data,
+                                   size_t *in_shape_size,
                                    const int **in_shape_ndim,
-                                   const int ***in_shape_data,
-                                   uint32_t *out_shape_size,
+                                   const int64_t ***in_shape_data,
+                                   size_t *out_shape_size,
                                    const int **out_shape_ndim,
-                                   const int ***out_shape_data,
-                                   uint32_t *aux_shape_size,
+                                   const int64_t ***out_shape_data,
+                                   size_t *aux_shape_size,
                                    const int **aux_shape_ndim,
-                                   const int ***aux_shape_data,
+                                   const int64_t ***aux_shape_data,
                                    int *complete);
 
 /*!
- * \brief infer shape of unknown input shapes given the known one.
- *  The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data
- *  The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional.
- *  This api is available when MXNet is built with flag
- *  USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support
- * \param sym symbol handle
- * \param num_args number of input arguments.
- * \param keys the key of keyword args (optional)
- * \param arg_ind_ptr the head pointer of the rows in CSR
- * \param arg_shape_data the content of the CSR
- * \param in_shape_size sizeof the returning array of in_shapes
- * \param in_shape_ndim returning array of shape dimensions of each input shape.
- * \param in_shape_data returning array of pointers to head of the input shape.
- * \param out_shape_size sizeof the returning array of out_shapes
- * \param out_shape_ndim returning array of shape dimensions of each output shape.
- * \param out_shape_data returning array of pointers to head of the output shape.
- * \param aux_shape_size sizeof the returning array of aux_shapes
- * \param aux_shape_ndim returning array of shape dimensions of each auxiliary shape.
- * \param aux_shape_data returning array of pointers to head of the auxiliary shape.
- * \param complete whether infer shape completes or more information is needed.
- * \return 0 when success, -1 when failure happens
- */
-MXNET_DLL int MXSymbolInferShapeEx64(SymbolHandle sym,
-                                     uint32_t num_args,
-                                     const char** keys,
-                                     const int64_t *arg_ind_ptr,
-                                     const int64_t *arg_shape_data,
-                                     size_t *in_shape_size,
-                                     const int **in_shape_ndim,
-                                     const int64_t ***in_shape_data,
-                                     size_t *out_shape_size,
-                                     const int **out_shape_ndim,
-                                     const int64_t ***out_shape_data,
-                                     size_t *aux_shape_size,
-                                     const int **aux_shape_ndim,
-                                     const int64_t ***aux_shape_data,
-                                     int *complete);
-
-/*!
- * \brief DEPRECATED. Use MXSymbolInferShapePartialEx instead.
- * partially infer shape of unknown input shapes given the known one.
+ * \brief partially infer the unknown input shapes given the known ones.
  *
  *  Return partially inferred results if not all shapes could be inferred.
  *  The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data
  *  The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional.
+ *  This API is available when MXNet is built with the flag
+ *  USE_INT64_TENSOR_SIZE=0 (the default).
  *
  * \param sym symbol handle
- * \param num_args numbe of input arguments.
+ * \param num_args number of input arguments.
  * \param keys the key of keyword args (optional)
  * \param arg_ind_ptr the head pointer of the rows in CSR
  * \param arg_shape_data the content of the CSR
@@ -1970,16 +1832,16 @@ MXNET_DLL int MXSymbolInferShapePartial(SymbolHandle sym,
                                         uint32_t num_args,
                                         const char** keys,
                                         const uint32_t *arg_ind_ptr,
-                                        const uint32_t *arg_shape_data,
+                                        const int *arg_shape_data,
                                         uint32_t *in_shape_size,
-                                        const uint32_t **in_shape_ndim,
-                                        const uint32_t ***in_shape_data,
+                                        const int **in_shape_ndim,
+                                        const int ***in_shape_data,
                                         uint32_t *out_shape_size,
-                                        const uint32_t **out_shape_ndim,
-                                        const uint32_t ***out_shape_data,
+                                        const int **out_shape_ndim,
+                                        const int ***out_shape_data,
                                         uint32_t *aux_shape_size,
-                                        const uint32_t **aux_shape_ndim,
-                                        const uint32_t ***aux_shape_data,
+                                        const int **aux_shape_ndim,
+                                        const int ***aux_shape_data,
                                         int *complete);
 
 /*!
@@ -1989,7 +1851,7 @@ MXNET_DLL int MXSymbolInferShapePartial(SymbolHandle sym,
  *  The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data
  *  The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional.
  *  This api is available when MXNet is built with flag
- *  USE_INT64_TENSOR_SIZE=0 (by default)
+ *  USE_INT64_TENSOR_SIZE=1 (not the default), i.e. with Large Tensor Support
  *
  * \param sym symbol handle
  * \param num_args number of input arguments.
@@ -2008,64 +1870,22 @@ MXNET_DLL int MXSymbolInferShapePartial(SymbolHandle sym,
  * \param complete whether infer shape completes or more information is needed.
  * \return 0 when success, -1 when failure happens
  */
-MXNET_DLL int MXSymbolInferShapePartialEx(SymbolHandle sym,
+MXNET_DLL int MXSymbolInferShapePartial64(SymbolHandle sym,
                                           uint32_t num_args,
                                           const char** keys,
-                                          const uint32_t *arg_ind_ptr,
-                                          const int *arg_shape_data,
-                                          uint32_t *in_shape_size,
+                                          const int64_t *arg_ind_ptr,
+                                          const int64_t *arg_shape_data,
+                                          size_t *in_shape_size,
                                           const int **in_shape_ndim,
-                                          const int ***in_shape_data,
-                                          uint32_t *out_shape_size,
+                                          const int64_t ***in_shape_data,
+                                          size_t *out_shape_size,
                                           const int **out_shape_ndim,
-                                          const int ***out_shape_data,
-                                          uint32_t *aux_shape_size,
+                                          const int64_t ***out_shape_data,
+                                          size_t *aux_shape_size,
                                           const int **aux_shape_ndim,
-                                          const int ***aux_shape_data,
+                                          const int64_t ***aux_shape_data,
                                           int *complete);
 
-/*!
- * \brief partially infer shape of unknown input shapes given the known one.
- *
- *  Return partially inferred results if not all shapes could be inferred.
- *  The shapes are packed into a CSR matrix represented by arg_ind_ptr and arg_shape_data
- *  The call will be treated as a kwargs call if key != NULL or num_args==0, otherwise it is positional.
- *  This api is available when MXNet is built with flag
- *  USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support
- *
- * \param sym symbol handle
- * \param num_args number of input arguments.
- * \param keys the key of keyword args (optional)
- * \param arg_ind_ptr the head pointer of the rows in CSR
- * \param arg_shape_data the content of the CSR
- * \param in_shape_size sizeof the returning array of in_shapes
- * \param in_shape_ndim returning array of shape dimensions of each input shape.
- * \param in_shape_data returning array of pointers to head of the input shape.
- * \param out_shape_size sizeof the returning array of out_shapes
- * \param out_shape_ndim returning array of shape dimensions of each output shape.
- * \param out_shape_data returning array of pointers to head of the output shape.
- * \param aux_shape_size sizeof the returning array of aux_shapes
- * \param aux_shape_ndim returning array of shape dimensions of each auxiliary shape.
- * \param aux_shape_data returning array of pointers to head of the auxiliary shape.
- * \param complete whether infer shape completes or more information is needed.
- * \return 0 when success, -1 when failure happens
- */
-MXNET_DLL int MXSymbolInferShapePartialEx64(SymbolHandle sym,
-                                            uint32_t num_args,
-                                            const char** keys,
-                                            const int64_t *arg_ind_ptr,
-                                            const int64_t *arg_shape_data,
-                                            size_t *in_shape_size,
-                                            const int **in_shape_ndim,
-                                            const int64_t ***in_shape_data,
-                                            size_t *out_shape_size,
-                                            const int **out_shape_ndim,
-                                            const int64_t ***out_shape_data,
-                                            size_t *aux_shape_size,
-                                            const int **aux_shape_ndim,
-                                            const int64_t ***aux_shape_data,
-                                            int *complete);
-
 /*!
  * \brief infer type of unknown input types given the known one.
  *  The types are packed into a CSR matrix represented by arg_ind_ptr and arg_type_data
@@ -3168,18 +2988,6 @@ MXNET_DLL int MXRtcCudaKernelCall(CudaKernelHandle handle, int dev_id, void** ar
  */
 MXNET_DLL int MXNDArrayGetSharedMemHandle(NDArrayHandle handle, int* shared_pid,
                                           int* shared_id);
-/*!
- * \brief DEPRECATED. Use MXNDArrayCreateFromSharedMemEx instead.
- * Reconstruct NDArray from shared memory handle
- * \param shared_pid shared PID
- * \param shared_id shared memory id
- * \param shape pointer to NDArray dimensions
- * \param ndim number of NDArray dimensions
- * \param dtype data type of NDArray
- * \param out constructed NDArray
- */
-MXNET_DLL int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const uint32_t *shape,
-                                           uint32_t ndim, int dtype, NDArrayHandle *out);
 
 /*!
  * \brief Release all unreferenced memory from the devices storage managers memory pool
@@ -3197,8 +3005,8 @@ MXNET_DLL int MXStorageEmptyCache(int dev_type, int dev_id);
  * \param dtype data type of NDArray
  * \param out constructed NDArray
  */
-MXNET_DLL int MXNDArrayCreateFromSharedMemEx(int shared_pid, int shared_id, const int *shape,
-                                             int ndim, int dtype, NDArrayHandle *out);
+MXNET_DLL int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const int *shape,
+                                           int ndim, int dtype, NDArrayHandle *out);
 
 /*!
   * \brief Push an asynchronous operation to the engine.
diff --git a/python/mxnet/__init__.py b/python/mxnet/__init__.py
index efdd02a3be6a..501ec3b8ea9b 100644
--- a/python/mxnet/__init__.py
+++ b/python/mxnet/__init__.py
@@ -71,10 +71,6 @@
 # Attribute scope to add attributes to symbolic graphs
 from .attribute import AttrScope
 
-from . import torch
-# use mx.th as short for mx.torch
-from . import torch as th
-
 from . import profiler
 from . import log
 
diff --git a/python/mxnet/_ctypes/ndarray.py b/python/mxnet/_ctypes/ndarray.py
index 4b15b5653a97..a22e7652b880 100644
--- a/python/mxnet/_ctypes/ndarray.py
+++ b/python/mxnet/_ctypes/ndarray.py
@@ -79,7 +79,7 @@ def _imperative_invoke(handle, ndargs, keys, vals, out, is_np_op, output_is_list
     # a handle's stype in _ndarray_cls
     out_stypes = ctypes.POINTER(ctypes.c_int)()
 
-    check_call(_LIB.MXImperativeInvokeEx(
+    check_call(_LIB.MXImperativeInvoke(
         ctypes.c_void_p(handle),
         ctypes.c_int(len(ndargs)),
         c_handle_array(ndargs),
@@ -105,19 +105,20 @@ class CachedOp(object):
     """Cached operator handle."""
     __slots__ = ["handle", "is_np_sym", "_monitor_callback"]
 
-    def __init__(self, sym, flags=()):
+    def __init__(self, sym, flags=(), thread_safe=False):
         self.handle = CachedOpHandle()
         self._monitor_callback = None
 
         from ..symbol.numpy._symbol import _Symbol
         self.is_np_sym = bool(isinstance(sym, _Symbol))
 
-        check_call(_LIB.MXCreateCachedOpEx(
+        check_call(_LIB.MXCreateCachedOp(
             sym.handle,
             len(flags),
             c_str_array([key for key, _ in flags]),
             c_str_array([str(val) for _, val in flags]),
-            ctypes.byref(self.handle)))
+            ctypes.byref(self.handle),
+            ctypes.c_bool(thread_safe)))
 
     def __del__(self):
         check_call(_LIB.MXFreeCachedOp(self.handle))
@@ -167,7 +168,7 @@ def __call__(self, *args, **kwargs):
         else:
             default_ctx = args[0].ctx if default_ctx is None else default_ctx
 
-        check_call(_LIB.MXInvokeCachedOpEx(
+        check_call(_LIB.MXInvokeCachedOp(
             self.handle,
             ctypes.c_int(len(args)),
             c_handle_array(args),
diff --git a/python/mxnet/cython/base.pyi b/python/mxnet/cython/base.pyi
index 3a7606b893f2..1df664ce2482 100644
--- a/python/mxnet/cython/base.pyi
+++ b/python/mxnet/cython/base.pyi
@@ -97,30 +97,31 @@ cdef extern from "mxnet/c_api.h":
     int MXSymbolSetAttr(SymbolHandle symbol,
                         const char* key,
                         const char* value);
-    int MXImperativeInvokeEx(OpHandle creator,
-                             int num_inputs,
-                             NDArrayHandle *inputs,
-                             int *num_outputs,
-                             NDArrayHandle **outputs,
-                             int num_params,
-                             const char **param_keys,
-                             const char **param_vals,
-                             const int **out_stypes);
-    int MXNDArrayFree(NDArrayHandle handle);
-    int MXCreateCachedOpEx(SymbolHandle handle,
-                            int num_flags,
-                            const char** keys,
-                            const char** vals,
-                            CachedOpHandle *out);
-    int MXFreeCachedOp(CachedOpHandle handle);
-    int MXInvokeCachedOpEx(CachedOpHandle handle,
+    int MXImperativeInvoke(OpHandle creator,
                            int num_inputs,
                            NDArrayHandle *inputs,
-                           int default_ctx_type,
-                           int default_ctx_dev_id,
                            int *num_outputs,
                            NDArrayHandle **outputs,
+                           int num_params,
+                           const char **param_keys,
+                           const char **param_vals,
                            const int **out_stypes);
+    int MXNDArrayFree(NDArrayHandle handle);
+    int MXCreateCachedOp(SymbolHandle handle,
+                          int num_flags,
+                          const char** keys,
+                          const char** vals,
+                          CachedOpHandle *out,
+                          _bool thread_safe);
+    int MXFreeCachedOp(CachedOpHandle handle);
+    int MXInvokeCachedOp(CachedOpHandle handle,
+                         int num_inputs,
+                         NDArrayHandle *inputs,
+                         int default_ctx_type,
+                         int default_ctx_dev_id,
+                         int *num_outputs,
+                         NDArrayHandle **outputs,
+                         const int **out_stypes);
     int MXCachedOpRegisterOpHook(NDArrayHandle handle,
                                  CachedOpMonitorCallback callback,
                                  _bool monitor_all);
diff --git a/python/mxnet/cython/ndarray.pyx b/python/mxnet/cython/ndarray.pyx
index f13e65824aec..733c73e683ff 100644
--- a/python/mxnet/cython/ndarray.pyx
+++ b/python/mxnet/cython/ndarray.pyx
@@ -116,12 +116,13 @@ cdef class CachedOp:
         from ..symbol.numpy._symbol import _Symbol
         self.is_np_sym = bool(isinstance(sym, _Symbol))
 
-        CALL(MXCreateCachedOpEx(
+        CALL(MXCreateCachedOp(
             <SymbolHandle>(<unsigned long long>sym.handle.value),
             len(flags),
             CBeginPtr(c_flag_keys),
             CBeginPtr(c_flag_vals),
-            &self.chandle))
+            &self.chandle,
+            False))
 
     def __del__(self):
         CALL(MXFreeCachedOp(self.chandle))
@@ -174,7 +175,7 @@ cdef class CachedOp:
         else:
             p_output_vars = &output_vars[0]
 
-        CALL(MXInvokeCachedOpEx(
+        CALL(MXInvokeCachedOp(
             self.chandle,
             <int>len(args),
             &ndvars[0] if ndvars.size() != 0 else NULL,
@@ -239,7 +240,7 @@ def _imperative_invoke(handle, ndargs, keys, vals, out, is_np_op=0, output_is_li
     cdef vector[const char*] param_keys = SVec2Ptr(ckeys)
     cdef vector[const char*] param_vals = SVec2Ptr(cvals)
 
-    CALL(MXImperativeInvokeEx(
+    CALL(MXImperativeInvoke(
         chandle,
         <int>ndvars.size(),
         &ndvars[0] if ndvars.size() != 0 else NULL,
diff --git a/python/mxnet/dlpack.py b/python/mxnet/dlpack.py
index b5e8ee83304e..9ef005f1bb2a 100644
--- a/python/mxnet/dlpack.py
+++ b/python/mxnet/dlpack.py
@@ -99,7 +99,7 @@ def from_dlpack(dlpack):
         assert ctypes.pythonapi.PyCapsule_IsValid(dlpack, _c_str_dltensor), ValueError(
             'Invalid DLPack Tensor. DLTensor capsules can be consumed only once.')
         dlpack_handle = ctypes.c_void_p(ctypes.pythonapi.PyCapsule_GetPointer(dlpack, _c_str_dltensor))
-        check_call(_LIB.MXNDArrayFromDLPackEx(dlpack_handle, False, ctypes.byref(handle)))
+        check_call(_LIB.MXNDArrayFromDLPack(dlpack_handle, False, ctypes.byref(handle)))
         # Rename PyCapsule (DLPack)
         ctypes.pythonapi.PyCapsule_SetName(dlpack, _c_str_used_dltensor)
         # delete the deleter of the old dlpack
@@ -180,6 +180,6 @@ def _make_dl_managed_tensor(array):
         ndarray.flags['WRITEABLE'] = False
         c_obj = _make_dl_managed_tensor(ndarray)
         handle = NDArrayHandle()
-        check_call(_LIB.MXNDArrayFromDLPackEx(ctypes.byref(c_obj), True, ctypes.byref(handle)))
+        check_call(_LIB.MXNDArrayFromDLPack(ctypes.byref(c_obj), True, ctypes.byref(handle)))
         return array_cls(handle=handle)
     return from_numpy
diff --git a/python/mxnet/ndarray/ndarray.py b/python/mxnet/ndarray/ndarray.py
index fa26dfff9628..0f638a1ed562 100644
--- a/python/mxnet/ndarray/ndarray.py
+++ b/python/mxnet/ndarray/ndarray.py
@@ -183,7 +183,7 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t):
             dtype_type = np.dtype(dtype)
         else:
             dtype_type = np.dtype(dtype).type
-        check_call(_LIB.MXNDArrayCreateEx64(
+        check_call(_LIB.MXNDArrayCreate64(
             c_array_buf(mx_int64, native_array('q', shape)),
             ctypes.c_int(len(shape)),
             ctypes.c_int(ctx.device_typeid),
@@ -205,7 +205,7 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t):
             dtype_type = np.dtype(dtype)
         else:
             dtype_type = np.dtype(dtype).type
-        check_call(_LIB.MXNDArrayCreateEx(
+        check_call(_LIB.MXNDArrayCreate(
             c_array_buf(mx_uint, native_array('I', shape)),
             mx_uint(len(shape)),
             ctypes.c_int(ctx.device_typeid),
@@ -218,7 +218,7 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t):
 
 def _new_from_shared_mem(shared_pid, shared_id, shape, dtype):
     hdl = NDArrayHandle()
-    check_call(_LIB.MXNDArrayCreateFromSharedMemEx(
+    check_call(_LIB.MXNDArrayCreateFromSharedMem(
         ctypes.c_int(shared_pid),
         ctypes.c_int(shared_id),
         c_array(mx_int, shape),
@@ -2426,11 +2426,11 @@ def shape(self):
         ndim = mx_int()
         if _int64_enabled():
             pdata = ctypes.POINTER(mx_int64)()
-            check_call(_LIB.MXNDArrayGetShapeEx64(
+            check_call(_LIB.MXNDArrayGetShape64(
                 self.handle, ctypes.byref(ndim), ctypes.byref(pdata)))
         else:
             pdata = ctypes.POINTER(mx_int)()
-            check_call(_LIB.MXNDArrayGetShapeEx(
+            check_call(_LIB.MXNDArrayGetShape(
                 self.handle, ctypes.byref(ndim), ctypes.byref(pdata)))
         if ndim.value == -1:
             return None
diff --git a/python/mxnet/numpy/multiarray.py b/python/mxnet/numpy/multiarray.py
index b61686738391..87cd5cac2096 100644
--- a/python/mxnet/numpy/multiarray.py
+++ b/python/mxnet/numpy/multiarray.py
@@ -119,7 +119,7 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t):  # pylint: disa
     """
     hdl = NDArrayHandle()
     if _int64_enabled():
-        check_call(_LIB.MXNDArrayCreateEx64(
+        check_call(_LIB.MXNDArrayCreate64(
             c_array_buf(mx_int64, native_array('q', shape)),
             ctypes.c_int(len(shape)),
             ctypes.c_int(ctx.device_typeid),
@@ -141,7 +141,7 @@ def _new_alloc_handle(shape, ctx, delay_alloc, dtype=mx_real_t):  # pylint: disa
             dtype_type = _np.dtype(dtype)
         else:
             dtype_type = _np.dtype(dtype).type
-        check_call(_LIB.MXNDArrayCreateEx(
+        check_call(_LIB.MXNDArrayCreate(
             c_array_buf(mx_uint, native_array('I', shape)),
             mx_uint(len(shape)),
             ctypes.c_int(ctx.device_typeid),
@@ -2331,11 +2331,11 @@ def shape(self):
         num_dim = mx_int()
         if _int64_enabled():
             pdata = ctypes.POINTER(mx_int64)()
-            check_call(_LIB.MXNDArrayGetShapeEx64(
+            check_call(_LIB.MXNDArrayGetShape64(
                 self.handle, ctypes.byref(num_dim), ctypes.byref(pdata)))
         else:
             pdata = ctypes.POINTER(mx_int)()
-            check_call(_LIB.MXNDArrayGetShapeEx(
+            check_call(_LIB.MXNDArrayGetShape(
                 self.handle, ctypes.byref(num_dim), ctypes.byref(pdata)))
         if num_dim.value == -1:
             return None
diff --git a/python/mxnet/profiler.py b/python/mxnet/profiler.py
index d43f7383daa3..1b9583e1ecbb 100644
--- a/python/mxnet/profiler.py
+++ b/python/mxnet/profiler.py
@@ -185,11 +185,11 @@ def dumps(reset=False, format='table', sort_by='total', ascending=False):
             "Invalid value provided for ascending: {0}. Support: False, True".format(ascending)
     assert  reset in reset_to_int.keys(),\
             "Invalid value provided for reset: {0}. Support: False, True".format(reset)
-    check_call(_LIB.MXAggregateProfileStatsPrintEx(ctypes.byref(debug_str),
-                                                   reset_to_int[reset],
-                                                   format_to_int[format],
-                                                   sort_by_to_int[sort_by],
-                                                   asc_to_int[ascending]))
+    check_call(_LIB.MXAggregateProfileStatsPrint(ctypes.byref(debug_str),
+                                                 reset_to_int[reset],
+                                                 format_to_int[format],
+                                                 sort_by_to_int[sort_by],
+                                                 asc_to_int[ascending]))
     return py_str(debug_str.value)
 
 
diff --git a/python/mxnet/symbol/symbol.py b/python/mxnet/symbol/symbol.py
index 039ac0d9d195..b957675088e0 100644
--- a/python/mxnet/symbol/symbol.py
+++ b/python/mxnet/symbol/symbol.py
@@ -1242,9 +1242,9 @@ def _infer_shape_impl(self, partial, *args, **kwargs):
             out_shape_data = ctypes.POINTER(ctypes.POINTER(mx_int64))()
             aux_shape_data = ctypes.POINTER(ctypes.POINTER(mx_int64))()
             if partial:
-                infer_func = _LIB.MXSymbolInferShapePartialEx64
+                infer_func = _LIB.MXSymbolInferShapePartial64
             else:
-                infer_func = _LIB.MXSymbolInferShapeEx64
+                infer_func = _LIB.MXSymbolInferShape64
             check_call(infer_func(
                 self.handle,
                 mx_uint(len(indptr) - 1),
@@ -1271,9 +1271,9 @@ def _infer_shape_impl(self, partial, *args, **kwargs):
             out_shape_data = ctypes.POINTER(ctypes.POINTER(mx_int))()
             aux_shape_data = ctypes.POINTER(ctypes.POINTER(mx_int))()
             if partial:
-                infer_func = _LIB.MXSymbolInferShapePartialEx
+                infer_func = _LIB.MXSymbolInferShapePartial
             else:
-                infer_func = _LIB.MXSymbolInferShapeEx
+                infer_func = _LIB.MXSymbolInferShape
             check_call(infer_func(
                 self.handle,
                 mx_uint(len(indptr) - 1),
diff --git a/python/mxnet/torch.py b/python/mxnet/torch.py
deleted file mode 100644
index 295c019166cf..000000000000
--- a/python/mxnet/torch.py
+++ /dev/null
@@ -1,182 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-# coding: utf-8
-"""Interface for NDArray functions executed by torch backend.
-Install Torch and compile with USE_TORCH=1 to use this module."""
-
-import ctypes
-import sys
-from .base import _LIB
-from .base import c_array, c_str_array, c_handle_array, py_str, build_param_doc as _build_param_doc
-from .base import mx_uint, mx_float, FunctionHandle
-from .base import check_call
-from .ndarray import NDArray, _new_empty_handle
-
-try:
-    _LUAJIT = ctypes.CDLL("libluajit.so", mode=ctypes.RTLD_GLOBAL)
-except OSError:
-    _LUAJIT = None
-
-# pylint: disable=too-many-locals, invalid-name
-def _make_torch_function(handle):
-    """Create a Torch function from the FunctionHandle."""
-    # Get the property of function
-    n_used_vars = mx_uint()
-    n_scalars = mx_uint()
-    n_mutate_vars = mx_uint()
-    type_mask = ctypes.c_int()
-    check_call(_LIB.MXFuncDescribe(
-        handle,
-        ctypes.byref(n_used_vars),
-        ctypes.byref(n_scalars),
-        ctypes.byref(n_mutate_vars),
-        ctypes.byref(type_mask)))
-    n_mutate_vars = n_mutate_vars.value
-    n_used_vars = n_used_vars.value
-    n_scalars = n_scalars.value
-    type_mask = type_mask.value
-
-    # Get the information from the function
-    name = ctypes.c_char_p()
-    desc = ctypes.c_char_p()
-    num_args = mx_uint()
-    arg_names = ctypes.POINTER(ctypes.c_char_p)()
-    arg_types = ctypes.POINTER(ctypes.c_char_p)()
-    arg_descs = ctypes.POINTER(ctypes.c_char_p)()
-    ret_type = ctypes.c_char_p()
-
-    check_call(_LIB.MXFuncGetInfo(
-        handle, ctypes.byref(name), ctypes.byref(desc),
-        ctypes.byref(num_args),
-        ctypes.byref(arg_names),
-        ctypes.byref(arg_types),
-        ctypes.byref(arg_descs),
-        ctypes.byref(ret_type)))
-    func_name = py_str(name.value)
-    if not func_name.startswith('_th_'):
-        return None
-    narg = int(num_args.value)
-    param_str = _build_param_doc(
-        [py_str(arg_names[i]) for i in range(narg)],
-        [py_str(arg_types[i]) for i in range(narg)],
-        [py_str(arg_descs[i]) for i in range(narg)])
-
-    if n_mutate_vars > 1:
-        res = ','.join(['res%d '%i for i in range(n_mutate_vars)])
-    else:
-        res = 'res '
-    doc_str = (('Interface for Torch function {name}.\n' +
-                'Invoke with\n{res}= mxnet.th.{name}(Parameters)\nor\n'+
-                'mxnet.th.{name}({res}, Parameters).\n\n' +
-                '{param_str}\n' +
-                'References: ' +
-                'https://github.com/torch/torch7/blob/master/doc/maths.md\n').format(
-                    name=func_name[4:], param_str=param_str,
-                    res=res))
-
-    def generic_torch_function(*args, **kwargs):
-        """Invoke this function by passing in parameters.
-
-        Parameters
-        ----------
-        *args
-            Positional arguments of inputs (both scalar and `NDArray`).
-
-        Returns
-        -------
-        out : NDArray
-            The result NDArray(tuple) of result of computation.
-        """
-        ndargs = []
-        arg_format = ''
-        value = ''
-        for arg in args:
-            if isinstance(arg, NDArray):
-                ndargs.append(arg)
-                arg_format += 'n'
-                value += ','
-            elif isinstance(arg, int):
-                arg_format += 'i'
-                value += str(arg) + ','
-            elif isinstance(arg, str):
-                arg_format += 's'
-                value += str(arg) + ','
-            elif isinstance(arg, float):
-                arg_format += 'f'
-                value += str(arg) + ','
-            elif isinstance(arg, bool):
-                arg_format += 'b'
-                value += str(arg) + ','
-        value = value[:-1]
-        if len(ndargs) == n_used_vars:
-            ndargs = [NDArray(_new_empty_handle()) for _ in range(n_mutate_vars)] + ndargs
-            arg_format = 'n'*n_mutate_vars + arg_format
-            value = ','*n_mutate_vars + value
-        elif len(ndargs) == n_mutate_vars + n_used_vars:
-            pass
-        else:
-            raise AssertionError(('Incorrect number of input NDArrays. ' +
-                                  'Need to be either %d (inputs) or %d ' +
-                                  '(output buffer) + %d (input)') %
-                                 (n_used_vars, n_mutate_vars, n_used_vars))
-
-        kwargs['format'] = arg_format
-        kwargs['args'] = value
-
-        for k in kwargs:
-            kwargs[k] = str(kwargs[k])
-
-        check_call(_LIB.MXFuncInvokeEx(
-            handle,
-            c_handle_array(ndargs[n_mutate_vars:]), # pylint: disable=invalid-slice-index
-            c_array(mx_float, []),
-            c_handle_array(ndargs[:n_mutate_vars]),   # pylint: disable=invalid-slice-index
-            ctypes.c_int(len(kwargs)),
-            c_str_array(kwargs.keys()),
-            c_str_array(kwargs.values())))
-
-        if n_mutate_vars == 1:
-            return ndargs[0]
-        else:
-            return ndargs[:n_mutate_vars] # pylint: disable=invalid-slice-index
-
-    # End of function declaration
-    ret_function = generic_torch_function
-    ret_function.__name__ = func_name[4:]
-    ret_function.__doc__ = doc_str
-    return ret_function
-
-# pylint: enable=too-many-locals, invalid-name
-
-def _init_torch_module():
-    """List and add all the torch backed ndarray functions to current module."""
-    plist = ctypes.POINTER(FunctionHandle)()
-    size = ctypes.c_uint()
-    check_call(_LIB.MXListFunctions(ctypes.byref(size),
-                                    ctypes.byref(plist)))
-
-    module_obj = sys.modules[__name__]
-    for i in range(size.value):
-        hdl = FunctionHandle(plist[i])
-        function = _make_torch_function(hdl)
-        # if function name starts with underscore, register as static method of NDArray
-        if function is not None:
-            setattr(module_obj, function.__name__, function)
-
-# Initialize the NDArray module
-_init_torch_module()
diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc
index 23049f1b8867..30194494f599 100644
--- a/src/c_api/c_api.cc
+++ b/src/c_api/c_api.cc
@@ -1638,42 +1638,26 @@ void CreateNDArray(const DataType* shape,
   *out = nd;
 }
 
+int MXNDArrayCreate64(const int64_t *shape,
+                      int ndim,
+                      int dev_type,
+                      int dev_id,
+                      int delay_alloc,
+                      int dtype,
+                      NDArrayHandle *out) {
+  API_BEGIN();
+  CreateNDArray<int64_t>(shape, ndim, dev_type, dev_id, delay_alloc, dtype, out);
+  API_END();
+}
+
 int MXNDArrayCreate(const uint32_t *shape,
                     uint32_t ndim,
                     int dev_type,
                     int dev_id,
                     int delay_alloc,
+                    int dtype,
                     NDArrayHandle *out) {
   API_BEGIN();
-  NDArray* nd = new NDArray(mxnet::TShape(shape, shape + ndim),
-                            Context::Create(static_cast<Context::DeviceType>(dev_type), dev_id),
-                            delay_alloc != 0);
-  nd->AssignStorageInfo(profiler::ProfilerScope::Get()->GetCurrentProfilerScope(),
-                        MXNET_STORAGE_DEFAULT_NAME_CSTR);
-  *out = nd;
-  API_END();
-}
-
-int MXNDArrayCreateEx64(const int64_t *shape,
-                        int ndim,
-                        int dev_type,
-                        int dev_id,
-                        int delay_alloc,
-                        int dtype,
-                        NDArrayHandle *out) {
-  API_BEGIN();
-  CreateNDArray<int64_t>(shape, ndim, dev_type, dev_id, delay_alloc, dtype, out);
-  API_END();
-}
-
-int MXNDArrayCreateEx(const uint32_t *shape,
-                      uint32_t ndim,
-                      int dev_type,
-                      int dev_id,
-                      int delay_alloc,
-                      int dtype,
-                      NDArrayHandle *out) {
-  API_BEGIN();
   CreateNDArray<uint32_t>(shape, static_cast<int>(ndim), dev_type, dev_id, delay_alloc, dtype, out);
   API_END();
 }
@@ -2041,25 +2025,6 @@ int MXNDArrayGetStorageType(NDArrayHandle handle,
   API_END();
 }
 
-int MXNDArrayGetShape(NDArrayHandle handle,
-                      uint32_t *out_dim,
-                      const uint32_t **out_pdata) {
-  MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
-  API_BEGIN();
-  NDArray *arr = static_cast<NDArray*>(handle);
-  if (!arr->is_none()) {
-    const mxnet::TShape &s = arr->shape();
-    *out_dim = s.ndim();
-    std::vector<uint32_t>& buffer = ret->arg_shape_buffer;
-    buffer.resize(s.ndim());
-    nnvm::ShapeTypeCast(s.begin(), s.end(), buffer.data());
-    *out_pdata = buffer.data();
-  } else {
-    *out_dim = 0;
-  }
-  API_END();
-}
-
 template<typename dtype>
 inline void GetShape(NDArrayHandle handle, const dtype** out_pdata, int* out_dim,
                      MXAPIThreadLocalEntry<dtype>* ret) {
@@ -2099,18 +2064,18 @@ inline void GetShape(NDArrayHandle handle, const dtype** out_pdata, int* out_dim
   }
 }
 
-int MXNDArrayGetShapeEx(NDArrayHandle handle,
-                        int *out_dim,
-                        const int **out_pdata) {
+int MXNDArrayGetShape(NDArrayHandle handle,
+                      int *out_dim,
+                      const int **out_pdata) {
   MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
   API_BEGIN();
   GetShape<int>(handle, out_pdata, out_dim, ret);
   API_END();
 }
 
-int MXNDArrayGetShapeEx64(NDArrayHandle handle,
-                          int *out_dim,
-                          const int64_t **out_pdata) {
+int MXNDArrayGetShape64(NDArrayHandle handle,
+                        int *out_dim,
+                        const int64_t **out_pdata) {
   MXAPIThreadLocalEntry<int64_t> *ret = MXAPIThreadLocalStore<int64_t>::Get();
   API_BEGIN();
   GetShape<int64_t>(handle, out_pdata, out_dim, ret);
@@ -2144,13 +2109,8 @@ int MXNDArrayToDLPack(NDArrayHandle handle,
 }
 
 int MXNDArrayFromDLPack(DLManagedTensorHandle dlpack,
+                        const bool transient_handle,
                         NDArrayHandle *out_handle) {
-  return MXNDArrayFromDLPackEx(dlpack, false, out_handle);
-}
-
-int MXNDArrayFromDLPackEx(DLManagedTensorHandle dlpack,
-                          const bool transient_handle,
-                          NDArrayHandle *out_handle) {
   API_BEGIN();
   *out_handle = new NDArray(NDArray::FromDLPack(
               static_cast<DLManagedTensor*>(dlpack),
@@ -2310,21 +2270,6 @@ int MXFuncDescribe(FunctionHandle fun,
 }
 
 int MXFuncInvoke(FunctionHandle fun,
-                 NDArrayHandle *use_vars,
-                 float *scalar_args,
-                 NDArrayHandle *mutate_vars) {
-  API_BEGIN();
-  auto *f = static_cast<const NDArrayFunctionReg*>(fun);
-  f->body((NDArray**)(use_vars),  //  NOLINT(*)
-          scalar_args,
-          (NDArray**)(mutate_vars),  //  NOLINT(*)
-          0,
-          nullptr,
-          nullptr);
-  API_END();
-}
-
-int MXFuncInvokeEx(FunctionHandle fun,
                  NDArrayHandle *use_vars,
                  float *scalar_args,
                  NDArrayHandle *mutate_vars,
@@ -3347,18 +3292,8 @@ int MXNDArrayGetSharedMemHandle(NDArrayHandle handle, int* shared_pid, int* shar
   API_END();
 }
 
-int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const uint32_t *shape,
-                                 uint32_t ndim, int dtype, NDArrayHandle *out) {
-  API_BEGIN();
-  NDArray* nd = new NDArray(shared_pid, shared_id, mxnet::TShape(shape, shape + ndim), dtype);
-  nd->AssignStorageInfo(profiler::ProfilerScope::Get()->GetCurrentProfilerScope(),
-                        MXNET_STORAGE_DEFAULT_NAME_CSTR);
-  *out = nd;
-  API_END();
-}
-
-int MXNDArrayCreateFromSharedMemEx(int shared_pid, int shared_id, const int *shape,
-                                   int ndim, int dtype, NDArrayHandle *out) {
+int MXNDArrayCreateFromSharedMem(int shared_pid, int shared_id, const int *shape,
+                                 int ndim, int dtype, NDArrayHandle *out) {
   API_BEGIN();
   NDArray* nd = new NDArray(shared_pid, shared_id, mxnet::TShape(shape, shape + ndim), dtype);
   nd->AssignStorageInfo(profiler::ProfilerScope::Get()->GetCurrentProfilerScope(),
diff --git a/src/c_api/c_api_ndarray.cc b/src/c_api/c_api_ndarray.cc
index ebb3134ae7f3..95346e897b56 100644
--- a/src/c_api/c_api_ndarray.cc
+++ b/src/c_api/c_api_ndarray.cc
@@ -143,22 +143,8 @@ int MXImperativeInvoke(AtomicSymbolCreator creator,
                        NDArrayHandle **outputs,
                        int num_params,
                        const char **param_keys,
-                       const char **param_vals) {
-  API_BEGIN();
-  MXImperativeInvokeImpl(creator, num_inputs, inputs, num_outputs, outputs,
-                         num_params, param_keys, param_vals);
-  API_END();
-}
-
-int MXImperativeInvokeEx(AtomicSymbolCreator creator,
-                         int num_inputs,
-                         NDArrayHandle *inputs,
-                         int *num_outputs,
-                         NDArrayHandle **outputs,
-                         int num_params,
-                         const char **param_keys,
-                         const char **param_vals,
-                         const int **out_stypes) {  // outputs storage types
+                       const char **param_vals,
+                       const int **out_stypes) {  // outputs storage types
   MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
   API_BEGIN();
   MXImperativeInvokeImpl(creator, num_inputs, inputs, num_outputs, outputs,
@@ -174,41 +160,11 @@ int MXImperativeInvokeEx(AtomicSymbolCreator creator,
 }
 
 int MXCreateCachedOp(SymbolHandle handle,
-                     CachedOpHandle *out) {
-  nnvm::Symbol* sym = static_cast<nnvm::Symbol*>(handle);
-
-  API_BEGIN();
-  auto inputs = sym->ListInputs(nnvm::Symbol::kAll);
-  std::vector<std::string> input_names;
-  input_names.reserve(inputs.size());
-  for (const auto& i : inputs) input_names.push_back(i->attrs.name);
-  *out = new CachedOpPtr(new CachedOp(
-      *sym, std::vector<std::pair<std::string, std::string> >()));
-  API_END();
-}
-
-int MXCreateCachedOpEx(SymbolHandle handle,
-                       int num_flags,
-                       const char** keys,
-                       const char** vals,
-                       CachedOpHandle *out) {
-  nnvm::Symbol* sym = static_cast<nnvm::Symbol*>(handle);
-
-  API_BEGIN();
-  std::vector<std::pair<std::string, std::string> > flags;
-  for (int i = 0; i < num_flags; ++i) {
-    flags.emplace_back(keys[i], vals[i]);
-  }
-  *out = new CachedOpPtr(new CachedOp(*sym, flags));
-  API_END();
-}
-
-int MXCreateCachedOpEX(SymbolHandle handle,
-                       int num_flags,
-                       const char** keys,
-                       const char** vals,
-                       CachedOpHandle *out,
-                       bool thread_safe) {
+                     int num_flags,
+                     const char** keys,
+                     const char** vals,
+                     CachedOpHandle *out,
+                     bool thread_safe) {
   nnvm::Symbol* sym = static_cast<nnvm::Symbol*>(handle);
   API_BEGIN();
   std::vector<std::pair<std::string, std::string> > flags;
@@ -243,14 +199,14 @@ int MXCachedOpGetOptimizedSymbol(CachedOpHandle handle,
   API_END_HANDLE_ERROR(delete s);
 }
 
-int MXInvokeCachedOpEx(CachedOpHandle handle,
-                       int num_inputs,
-                       NDArrayHandle *inputs,
-                       int default_dev_type,
-                       int default_dev_id,
-                       int *num_outputs,
-                       NDArrayHandle **outputs,
-                       const int **out_stypes) {  // outputs storage types
+int MXInvokeCachedOp(CachedOpHandle handle,
+                     int num_inputs,
+                     NDArrayHandle *inputs,
+                     int default_dev_type,
+                     int default_dev_id,
+                     int *num_outputs,
+                     NDArrayHandle **outputs,
+                     const int **out_stypes) {  // outputs storage types
   MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
 
   API_BEGIN();
diff --git a/src/c_api/c_api_profile.cc b/src/c_api/c_api_profile.cc
index 79d11b92dff6..bdc7664fc061 100644
--- a/src/c_api/c_api_profile.cc
+++ b/src/c_api/c_api_profile.cc
@@ -316,12 +316,8 @@ int MXSetProfilerConfig(int num_params, const char* const* keys, const char* con
   return MXSetProcessProfilerConfig(num_params, keys, vals, nullptr);
 }
 
-int MXAggregateProfileStatsPrint(const char **out_str, int reset) {
-  return MXAggregateProfileStatsPrintEx(out_str, reset, 0, 0, 0);
-}
-
-int MXAggregateProfileStatsPrintEx(const char **out_str, int reset, int format, int sort_by,
-                                  int ascending) {
+int MXAggregateProfileStatsPrint(const char **out_str, int reset, int format, int sort_by,
+                                 int ascending) {
   MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
   API_BEGIN();
     CHECK_NOTNULL(out_str);
diff --git a/src/c_api/c_api_symbolic.cc b/src/c_api/c_api_symbolic.cc
index caee7626c8dd..6f5f03a59a15 100644
--- a/src/c_api/c_api_symbolic.cc
+++ b/src/c_api/c_api_symbolic.cc
@@ -571,79 +571,6 @@ void MatchArguments(
 
 }  // namespace mxnet
 
-int MXSymbolInferShape(SymbolHandle sym,
-                       uint32_t num_args,
-                       const char** keys,
-                       const uint32_t *arg_ind_ptr,
-                       const uint32_t *arg_shape_data,
-                       uint32_t *in_shape_size,
-                       const uint32_t **in_shape_ndim,
-                       const uint32_t ***in_shape_data,
-                       uint32_t *out_shape_size,
-                       const uint32_t **out_shape_ndim,
-                       const uint32_t ***out_shape_data,
-                       uint32_t *aux_shape_size,
-                       const uint32_t **aux_shape_ndim,
-                       const uint32_t ***aux_shape_data,
-                       int *complete) {
-  nnvm::Symbol *s = static_cast<nnvm::Symbol*>(sym);
-  MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
-  API_BEGIN();
-  nnvm::Graph g = Symbol2Graph(*s);
-  mxnet::ShapeVector arg_shapes(g.indexed_graph().input_nodes().size(), mxnet::TShape());
-  if (keys == nullptr && num_args != 0) {
-    std::vector<uint32_t> read_only_args = mxnet::ReadOnlyArgIndices(g.indexed_graph());
-    CHECK_LE(num_args, read_only_args.size());
-    for (uint32_t i = 0; i < num_args; ++i) {
-      arg_shapes[read_only_args[i]] = mxnet::ShapeTypeCast(
-          arg_shape_data + arg_ind_ptr[i], arg_shape_data + arg_ind_ptr[i+1]);
-    }
-  } else {
-    std::unordered_map<std::string, mxnet::TShape> kwargs;
-    for (uint32_t i = 0; i < num_args; ++i) {
-      kwargs[keys[i]] = mxnet::ShapeTypeCast(
-          arg_shape_data + arg_ind_ptr[i], arg_shape_data + arg_ind_ptr[i+1]);
-    }
-    mxnet::MatchArguments(g.indexed_graph(), kwargs, &arg_shapes, "InferShape");
-  }
-
-  try {
-    g = mxnet::exec::InferShape(std::move(g), std::move(arg_shapes), "__shape__");
-  } catch (const mxnet::op::InferShapeError &err) {
-    throw dmlc::Error(err.msg);
-  }
-
-  // if use legacy shape definition, need to convert numpy shape to legacy shape
-  mxnet::ShapeVector shapes = g.GetAttr<mxnet::ShapeVector>("shape");
-  if (!Imperative::Get()->is_np_shape()) {
-    common::ConvertToLegacyShape(&shapes);
-  }
-
-  // copy back
-  CopyAttr(g.indexed_graph(), shapes,
-           &(ret->arg_shapes), &(ret->out_shapes), &(ret->aux_shapes));
-
-  // copy data back
-  MXAPIThreadLocalEntry<>::SetupShapeArrayReturnWithBuffer(ret->arg_shapes,
-      &(ret->arg_shape_ndim), &(ret->arg_shape_data), &(ret->arg_shape_buffer));
-  MXAPIThreadLocalEntry<>::SetupShapeArrayReturnWithBuffer(ret->out_shapes,
-      &(ret->out_shape_ndim), &(ret->out_shape_data), &(ret->out_shape_buffer));
-  MXAPIThreadLocalEntry<>::SetupShapeArrayReturnWithBuffer(ret->aux_shapes,
-      &(ret->aux_shape_ndim), &(ret->aux_shape_data), &(ret->aux_shape_buffer));
-  *in_shape_size = static_cast<uint32_t>(ret->arg_shapes.size());
-  *in_shape_ndim = dmlc::BeginPtr(ret->arg_shape_ndim);
-  *in_shape_data = dmlc::BeginPtr(ret->arg_shape_data);
-  *out_shape_size = static_cast<uint32_t>(ret->out_shapes.size());
-  *out_shape_ndim = dmlc::BeginPtr(ret->out_shape_ndim);
-  *out_shape_data = dmlc::BeginPtr(ret->out_shape_data);
-  *aux_shape_size = static_cast<uint32_t>(ret->aux_shapes.size());
-  *aux_shape_ndim = dmlc::BeginPtr(ret->aux_shape_ndim);
-  *aux_shape_data = dmlc::BeginPtr(ret->aux_shape_data);
-  // mark complete
-  *complete = (g.GetAttr<size_t>("shape_num_unknown_nodes") == 0);
-  API_END();
-}
-
 template<typename dtype, typename stype, typename itype>
 inline void SymbolInferShape(const char** keys,
                              uint32_t num_args,
@@ -737,21 +664,21 @@ inline void SymbolInferShape(const char** keys,
  * \param complete indicates completion of Shape Inference
  * \return 0 when success, -1 when failure happens
  */
-int MXSymbolInferShapeEx(SymbolHandle sym,
-                         uint32_t num_args,
-                         const char** keys,
-                         const uint32_t *arg_ind_ptr,
-                         const int *arg_shape_data,
-                         uint32_t *in_shape_size,
-                         const int **in_shape_ndim,
-                         const int ***in_shape_data,
-                         uint32_t *out_shape_size,
-                         const int **out_shape_ndim,
-                         const int ***out_shape_data,
-                         uint32_t *aux_shape_size,
-                         const int **aux_shape_ndim,
-                         const int ***aux_shape_data,
-                         int *complete) {
+int MXSymbolInferShape(SymbolHandle sym,
+                       uint32_t num_args,
+                       const char** keys,
+                       const uint32_t *arg_ind_ptr,
+                       const int *arg_shape_data,
+                       uint32_t *in_shape_size,
+                       const int **in_shape_ndim,
+                       const int ***in_shape_data,
+                       uint32_t *out_shape_size,
+                       const int **out_shape_ndim,
+                       const int ***out_shape_data,
+                       uint32_t *aux_shape_size,
+                       const int **aux_shape_ndim,
+                       const int ***aux_shape_data,
+                       int *complete) {
   nnvm::Symbol *s = static_cast<nnvm::Symbol*>(sym);
   MXAPIThreadLocalEntry<> *ret = MXAPIThreadLocalStore<>::Get();
   API_BEGIN();
@@ -795,21 +722,21 @@ int MXSymbolInferShapeEx(SymbolHandle sym,
  * \param complete indicates completion of Shape Inference
  * \return 0 when success, -1 when failure happens
  */
-int MXSymbolInferShapeEx64(SymbolHandle sym,
-                           uint32_t num_args,
-                           const char** keys,
-                           const int64_t *arg_ind_ptr,
-                           const int64_t *arg_shape_data,
-                           size_t *in_shape_size,
-                           const int **in_shape_ndim,
-                           const int64_t ***in_shape_data,
-                           size_t *out_shape_size,
-                           const int **out_shape_ndim,
-                           const int64_t ***out_shape_data,
-                           size_t *aux_shape_size,
-                           const int **aux_shape_ndim,
-                           const int64_t ***aux_shape_data,
-                           int *complete) {
+int MXSymbolInferShape64(SymbolHandle sym,
+                         uint32_t num_args,
+                         const char** keys,
+                         const int64_t *arg_ind_ptr,
+                         const int64_t *arg_shape_data,
+                         size_t *in_shape_size,
+                         const int **in_shape_ndim,
+                         const int64_t ***in_shape_data,
+                         size_t *out_shape_size,
+                         const int **out_shape_ndim,
+                         const int64_t ***out_shape_data,
+                         size_t *aux_shape_size,
+                         const int **aux_shape_ndim,
+                         const int64_t ***aux_shape_data,
+                         int *complete) {
   nnvm::Symbol *s = static_cast<nnvm::Symbol*>(sym);
   MXAPIThreadLocalEntry<int64_t> *ret = MXAPIThreadLocalStore<int64_t>::Get();
   API_BEGIN();
@@ -832,20 +759,41 @@ int MXSymbolInferShapeEx64(SymbolHandle sym,
   API_END();
 }
 
+/*!
+ * \brief Executor for Symbol Partial Shape Inference
+ *  This api is available when MXNet is built with flag
+ *  USE_INT64_TENSOR_SIZE=0 (by default)
+ * \param sym symbol handle
+ * \param num_args number of args
+ * \param keys keys
+ * \param arg_ind_ptr arg index pointer
+ * \param arg_shape_data arg shape data
+ * \param in_shape_size input shape size
+ * \param in_shape_ndim input shape number of dims
+ * \param in_shape_data input shape data
+ * \param out_shape_size output shape size
+ * \param out_shape_ndim output shape number of dims
+ * \param out_shape_data output shape data
+ * \param aux_shape_size shape size of auxiliary states
+ * \param aux_shape_ndim number of dims of auxiliary states shape
+ * \param aux_shape_data shape data of auxiliary states
+ * \param complete indicates completion of Shape Inference
+ * \return 0 when success, -1 when failure happens
+ */
 int MXSymbolInferShapePartial(SymbolHandle sym,
                               uint32_t num_args,
                               const char** keys,
                               const uint32_t *arg_ind_ptr,
-                              const uint32_t *arg_shape_data,
+                              const int *arg_shape_data,
                               uint32_t *in_shape_size,
-                              const uint32_t **in_shape_ndim,
-                              const uint32_t ***in_shape_data,
+                              const int **in_shape_ndim,
+                              const int ***in_shape_data,
                               uint32_t *out_shape_size,
-                              const uint32_t **out_shape_ndim,
-                              const uint32_t ***out_shape_data,
+                              const int **out_shape_ndim,
+                              const int ***out_shape_data,
                               uint32_t *aux_shape_size,
-                              const uint32_t **aux_shape_ndim,
-                              const uint32_t ***aux_shape_data,
+                              const int **aux_shape_ndim,
+                              const int ***aux_shape_data,
                               int *complete) {
   int succ = 0;
   *complete = 1;
@@ -860,7 +808,7 @@ int MXSymbolInferShapePartial(SymbolHandle sym,
 /*!
  * \brief Executor for Symbol Partial Shape Inference
  *  This api is available when MXNet is built with flag
- *  USE_INT64_TENSOR_SIZE=0 (by default)
+ *  USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support
  * \param sym symbol handle
  * \param num_args number of args
  * \param keys keys
@@ -878,24 +826,24 @@ int MXSymbolInferShapePartial(SymbolHandle sym,
  * \param complete indicates completion of Shape Inference
  * \return 0 when success, -1 when failure happens
  */
-int MXSymbolInferShapePartialEx(SymbolHandle sym,
+int MXSymbolInferShapePartial64(SymbolHandle sym,
                                 uint32_t num_args,
                                 const char** keys,
-                                const uint32_t *arg_ind_ptr,
-                                const int *arg_shape_data,
-                                uint32_t *in_shape_size,
+                                const int64_t *arg_ind_ptr,
+                                const int64_t *arg_shape_data,
+                                size_t *in_shape_size,
                                 const int **in_shape_ndim,
-                                const int ***in_shape_data,
-                                uint32_t *out_shape_size,
+                                const int64_t ***in_shape_data,
+                                size_t *out_shape_size,
                                 const int **out_shape_ndim,
-                                const int ***out_shape_data,
-                                uint32_t *aux_shape_size,
+                                const int64_t ***out_shape_data,
+                                size_t *aux_shape_size,
                                 const int **aux_shape_ndim,
-                                const int ***aux_shape_data,
+                                const int64_t ***aux_shape_data,
                                 int *complete) {
   int succ = 0;
   *complete = 1;
-  return MXSymbolInferShapeEx(sym, num_args, keys,
+  return MXSymbolInferShape64(sym, num_args, keys,
                               arg_ind_ptr, arg_shape_data,
                               in_shape_size, in_shape_ndim, in_shape_data,
                               out_shape_size, out_shape_ndim, out_shape_data,
@@ -903,52 +851,6 @@ int MXSymbolInferShapePartialEx(SymbolHandle sym,
                               &succ);
 }
 
-/*!
- * \brief Executor for Symbol Partial Shape Inference
- *  This api is available when MXNet is built with flag
- *  USE_INT64_TENSOR_SIZE=1 (not default) i.e. Large Tensor Support
- * \param sym symbol handle
- * \param num_args number of args
- * \param keys keys
- * \param arg_ind_ptr arg index pointer
- * \param arg_shape_data arg shape data
- * \param in_shape_size input shape size
- * \param in_shape_ndim input shape number of dims
- * \param in_shape_data input shape data
- * \param out_shape_size ouput shape size
- * \param out_shape_ndim output shape number of dims
- * \param out_shape_data output shape data
- * \param aux_shape_size shape size of auxiliary states
- * \param aux_shape_ndim number of dims of auxiliary states shape
- * \param aux_shape_data shape data of auxiliary states
- * \param complete indicates completion of Shape Inference
- * \return 0 when success, -1 when failure happens
- */
-int MXSymbolInferShapePartialEx64(SymbolHandle sym,
-                                  uint32_t num_args,
-                                  const char** keys,
-                                  const int64_t *arg_ind_ptr,
-                                  const int64_t *arg_shape_data,
-                                  size_t *in_shape_size,
-                                  const int **in_shape_ndim,
-                                  const int64_t ***in_shape_data,
-                                  size_t *out_shape_size,
-                                  const int **out_shape_ndim,
-                                  const int64_t ***out_shape_data,
-                                  size_t *aux_shape_size,
-                                  const int **aux_shape_ndim,
-                                  const int64_t ***aux_shape_data,
-                                  int *complete) {
-  int succ = 0;
-  *complete = 1;
-  return MXSymbolInferShapeEx64(sym, num_args, keys,
-                                arg_ind_ptr, arg_shape_data,
-                                in_shape_size, in_shape_ndim, in_shape_data,
-                                out_shape_size, out_shape_ndim, out_shape_data,
-                                aux_shape_size, aux_shape_ndim, aux_shape_data,
-                                &succ);
-}
-
 int MXSymbolInferType(SymbolHandle sym,
                       uint32_t num_args,
                       const char** keys,
diff --git a/src/operator/convolution_v1-inl.h b/src/operator/convolution_v1-inl.h
deleted file mode 100644
index 0b9981737634..000000000000
--- a/src/operator/convolution_v1-inl.h
+++ /dev/null
@@ -1,556 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file convolution_v1-inl.h
- * \brief
- * \author Bing Xu
-*/
-#ifndef MXNET_OPERATOR_CONVOLUTION_V1_INL_H_
-#define MXNET_OPERATOR_CONVOLUTION_V1_INL_H_
-
-#include <mxnet/io.h>
-#include <mxnet/base.h>
-#include <mxnet/ndarray.h>
-#include <mxnet/operator.h>
-#include <dmlc/logging.h>
-#include <dmlc/optional.h>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "./operator_common.h"
-#include "./linalg.h"
-
-namespace mxnet {
-namespace op {
-
-namespace conv_v1 {
-enum ConvolutionV1OpInputs {kData, kWeight, kBias};
-enum ConvolutionV1OpOutputs {kOut};
-enum ConvolutionV1OpResource {kTempSpace};
-enum ConvolutionV1OpCudnnTune {kOff, kLimited, kFastest};
-}
-
-struct ConvolutionV1Param : public dmlc::Parameter<ConvolutionV1Param> {
-  mxnet::TShape kernel;
-  mxnet::TShape stride;
-  mxnet::TShape dilate;
-  mxnet::TShape pad;
-  uint32_t num_filter;
-  uint32_t num_group;
-  uint64_t workspace;
-  bool no_bias;
-  dmlc::optional<int> cudnn_tune;
-  bool cudnn_off;
-  dmlc::optional<int> layout;
-  DMLC_DECLARE_PARAMETER(ConvolutionV1Param) {
-    DMLC_DECLARE_FIELD(kernel).describe("convolution kernel size: (h, w) or (d, h, w)");
-    DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape(0, 0))
-    .describe("convolution stride: (h, w) or (d, h, w)");
-    DMLC_DECLARE_FIELD(dilate).set_default(mxnet::TShape(0, 0))
-    .describe("convolution dilate: (h, w) or (d, h, w)");
-    DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape(0, 0))
-    .describe("pad for convolution: (h, w) or (d, h, w)");
-    DMLC_DECLARE_FIELD(num_filter).set_lower_bound(1)
-    .describe("convolution filter(channel) number");
-    DMLC_DECLARE_FIELD(num_group).set_default(1)
-    .describe("Number of group partitions. Equivalent to slicing input into num_group\n    "
-              "partitions, apply convolution on each, then concatenate the results");
-    DMLC_DECLARE_FIELD(workspace).set_default(1024).set_lower_bound(0)
-    .describe("Maximum temporary workspace allowed for convolution (MB)."
-              "This parameter determines the effective batch size of the convolution "
-              "kernel, which may be smaller than the given batch size. "
-              "Also, the workspace will be automatically enlarged to make sure that we can "
-              "run the kernel with batch_size=1");
-    DMLC_DECLARE_FIELD(no_bias).set_default(false)
-    .describe("Whether to disable bias parameter.");
-    DMLC_DECLARE_FIELD(cudnn_tune)
-    .add_enum("off", conv_v1::kOff)
-    .add_enum("limited_workspace", conv_v1::kLimited)
-    .add_enum("fastest", conv_v1::kFastest)
-    .set_default(dmlc::optional<int>())
-    .describe("Whether to pick convolution algo by running performance test.\n    "
-              "Leads to higher startup time but may give faster speed. Options are:\n    "
-              "\'off\': no tuning\n    "
-              "\'limited_workspace\': run test and pick the fastest algorithm "
-              "that doesn't exceed workspace limit.\n    "
-              "\'fastest\': pick the fastest algorithm and ignore workspace limit.\n    "
-              "If set to None (default), behavior is determined by environment\n    "
-              "variable MXNET_CUDNN_AUTOTUNE_DEFAULT: 0 for off,\n    "
-              "1 for limited workspace (default), 2 for fastest.");
-    DMLC_DECLARE_FIELD(cudnn_off).set_default(false)
-    .describe("Turn off cudnn for this layer.");
-    DMLC_DECLARE_FIELD(layout)
-    .add_enum("NCHW", mshadow::kNCHW)
-    .add_enum("NHWC", mshadow::kNHWC)
-    .add_enum("NCDHW", mshadow::kNCDHW)
-    .add_enum("NDHWC", mshadow::kNDHWC)
-    .set_default(dmlc::optional<int>())
-    .describe("Set layout for input, output and weight. Empty for\n    "
-              "default layout: NCHW for 2d and NCDHW for 3d.");
-  }
-};
-
-template<typename xpu, typename DType>
-class ConvolutionV1Op : public Operator {
- public:
-  explicit ConvolutionV1Op(ConvolutionV1Param p) {
-    this->param_ = p;
-    // convert MBytes first to Bytes and then to elements.
-    param_.workspace = (param_.workspace << 20) / sizeof(DType);
-    CHECK(param_.layout.value() == mshadow::kNCHW ||
-          param_.layout.value() == mshadow::kNCDHW)
-      << "Only support NCHW and NCDHW layout";
-  }
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(req[conv_v1::kOut], kWriteTo);
-    size_t expected = param_.no_bias ? 2 : 3;
-    CHECK_EQ(in_data.size(), expected);
-    CHECK_EQ(out_data.size(), 1U);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    if (param_.kernel.ndim() > 2) {
-      LOG(FATAL) << "Volume convolution is not implmented in mshadow";
-    }
-    Tensor<xpu, 4, DType> data = in_data[conv_v1::kData].get<xpu, 4, DType>(s);
-    Shape<3> wmat_shape =
-        Shape3(param_.num_group,
-               param_.num_filter / param_.num_group,
-               data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]);
-    Tensor<xpu, 3, DType> wmat =
-        in_data[conv_v1::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s);
-    Tensor<xpu, 4, DType> out = out_data[conv_v1::kOut].get<xpu, 4, DType>(s);
-#if defined(__CUDACC__)
-    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
-        << "Must init CuBLAS handle in stream";
-#endif
-    const index_t nbatch = data.size(0);
-    Tensor<xpu, 1, DType> workspace =
-        ctx.requested[conv_v1::kTempSpace].get_space_typed<xpu, 1, DType>(
-            Shape1(this->InitTemp(data.shape_, out.shape_)), s);
-    for (index_t i = 0; i < nbatch; i += nstep_) {
-      const index_t step = std::min(nstep_, nbatch - i);
-      Tensor<xpu, 2, DType> temp_col = Tensor<xpu, 2, DType>(workspace.dptr_,
-                                               Shape2(shape_colunit_[0],
-                                                      shape_colunit_[1] * step), s);
-      Tensor<xpu, 3, DType> temp_dst = Tensor<xpu, 3, DType>(
-                                               workspace.dptr_ + temp_col.shape_.Size(),
-                                               Shape3(shape_dstunit_[0],
-                                                      shape_dstunit_[1],
-                                                      shape_dstunit_[2] * step), s);
-      if (param_.pad[0] == 0 && param_.pad[1] == 0) {
-        temp_col = unpack_patch2col(data.Slice(i, i + step),
-                                    param_.kernel[0],
-                                    param_.kernel[1],
-                                    param_.stride[0],
-                                    param_.stride[1],
-                                    param_.dilate[0],
-                                    param_.dilate[1]);
-      } else {
-        temp_col = unpack_patch2col(pad(data.Slice(i, i + step),
-                                    param_.pad[0], param_.pad[1]),
-                                    param_.kernel[0],
-                                    param_.kernel[1],
-                                    param_.stride[0],
-                                    param_.stride[1],
-                                    param_.dilate[0],
-                                    param_.dilate[1]);
-      }
-
-      const index_t gstride = temp_col.size(0) / param_.num_group;
-      for (uint32_t gid = 0; gid < param_.num_group; ++gid) {
-        mshadow::Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid,
-                                       gstride * (gid + 1));
-        // Legacy approach shown here for comparison:
-        //   temp_dst[gid] = dot(wmat[gid], tmpc);
-        linalg_gemm(wmat[gid], tmpc, temp_dst[gid], false, false, s);
-      }
-      out.Slice(i, i + step) = swapaxis<1, 0>(reshape(temp_dst,
-                                              mshadow::Shape4(param_.num_filter,
-                                                  step,
-                                                  out.size(2),
-                                                  out.size(3))));
-    }
-    if (!param_.no_bias) {
-      // add bias, broadcast bias to dim 1: channel
-      Tensor<xpu, 1, DType> bias = in_data[conv_v1::kBias].get<xpu, 1, DType>(s);
-      out += broadcast<1>(bias, out.shape_);
-    }
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    // TODO(bing): check the BLAS Handle, be careful
-    if (param_.kernel.ndim() > 2) {
-      LOG(FATAL) << "Volume convolution is not implmented in mshadow";
-    }
-    CHECK_EQ(out_grad.size(), 1);
-    size_t expected = param_.no_bias == 0 ? 3 : 2;
-    CHECK(in_data.size() == expected && in_grad.size() == expected);
-    CHECK_EQ(req.size(), expected);
-    CHECK_EQ(in_data[conv_v1::kWeight].CheckContiguous(), true);
-    // get data
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> data = in_data[conv_v1::kData].get<xpu, 4, DType>(s);
-    Shape<3> wmat_shape =
-        Shape3(param_.num_group,
-               param_.num_filter / param_.num_group,
-               data.shape_[1] / param_.num_group * param_.kernel[0] * param_.kernel[1]);
-    Tensor<xpu, 3, DType> wmat =
-        in_data[conv_v1::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s);
-    Tensor<xpu, 4, DType> grad = out_grad[conv_v1::kOut].get<xpu, 4, DType>(s);
-    Tensor<xpu, 4, DType> gdata = in_grad[conv_v1::kData].get<xpu, 4, DType>(s);
-    Tensor<xpu, 3, DType> gwmat =
-        in_grad[conv_v1::kWeight].get_with_shape<xpu, 3, DType>(wmat_shape, s);
-#if defined(__CUDACC__)
-    CHECK_EQ(s->blas_handle_ownership_, Stream<xpu>::OwnHandle)
-        << "Must init CuBLAS handle in stream";
-#endif
-    const index_t nbatch = data.size(0);
-    Tensor<xpu, 1, DType> workspace =
-        ctx.requested[conv_v1::kTempSpace].get_space_typed<xpu, 1, DType>(
-            Shape1(this->InitTemp(data.shape_, grad.shape_)), s);
-    for (index_t i = 0; i < nbatch; i += nstep_) {
-      const index_t step = std::min(nstep_, nbatch - i);
-      Tensor<xpu, 2, DType> temp_col = Tensor<xpu, 2, DType>(workspace.dptr_,
-                                               Shape2(shape_colunit_[0],
-                                                      shape_colunit_[1] * step), s);
-      Tensor<xpu, 3, DType> temp_dst = Tensor<xpu, 3, DType>(
-                                               workspace.dptr_ + temp_col.shape_.Size(),
-                                               Shape3(shape_dstunit_[0],
-                                                      shape_dstunit_[1],
-                                                      shape_dstunit_[2] * step), s);
-      temp_dst = reshape(swapaxis<1, 0>(grad.Slice(i, i + step)), temp_dst.shape_);
-      if (param_.pad[0] == 0 && param_.pad[1] == 0) {
-        temp_col = unpack_patch2col(data.Slice(i, i + step),
-                                     param_.kernel[0],
-                                     param_.kernel[1],
-                                     param_.stride[0],
-                                     param_.stride[1],
-                                     param_.dilate[0],
-                                     param_.dilate[1]);
-      } else {
-        temp_col = unpack_patch2col(pad(data.Slice(i, i + step), param_.pad[0], param_.pad[1]),
-                                     param_.kernel[0],
-                                     param_.kernel[1],
-                                     param_.stride[0],
-                                     param_.stride[1],
-                                     param_.dilate[0],
-                                     param_.dilate[1]);
-      }
-      const index_t gstride = temp_col.size(0) / param_.num_group;
-      for (uint32_t gid = 0; gid < param_.num_group; ++gid) {
-        Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1));
-        if (i == 0) {
-          Tensor<xpu, 2, DType> tmp_gwmat = gwmat[gid];
-          // Legacy approach shown here for comparison:
-          //   Assign(tmp_gwmat, req[conv_v1::kWeight], dot(temp_dst[gid], tmpc.T()));
-          linalg_gemm(temp_dst[gid], tmpc, tmp_gwmat, false, true, s, req[conv_v1::kWeight]);
-        } else {
-          // Legacy approach shown here for comparison:
-          //   gwmat[gid] += dot(temp_dst[gid], tmpc.T());
-          linalg_gemm(temp_dst[gid], tmpc, gwmat[gid], false, true, s, kAddTo);
-        }
-      }
-
-      for (uint32_t gid = 0; gid < param_.num_group; ++gid) {
-        Tensor<xpu, 2, DType> tmpc = temp_col.Slice(gstride * gid, gstride * (gid + 1));
-        // Legacy approach shown here for comparison:
-        //   tmpc = dot(wmat[gid].T(), temp_dst[gid]);
-        linalg_gemm(wmat[gid], temp_dst[gid], tmpc, true, false, s);
-      }
-      if (param_.pad[0] == 0 && param_.pad[1] == 0) {
-        Assign(gdata.Slice(i, i + step), req[conv_v1::kData],
-               pack_col2patch(temp_col,
-                              data.Slice(i, i + step).shape_,
-                              param_.kernel[0],
-                              param_.kernel[1],
-                              param_.stride[0],
-                              param_.stride[1],
-                              param_.dilate[0],
-                              param_.dilate[1]));
-      } else {
-        Shape<4> pshape = data.Slice(i, i + step).shape_;
-        pshape[2] += 2 * param_.pad[0];
-        pshape[3] += 2 * param_.pad[1];
-        Assign(gdata.Slice(i, i + step), req[conv_v1::kData],
-               crop(pack_col2patch(temp_col,
-                                   pshape,
-                                   param_.kernel[0],
-                                   param_.kernel[1],
-                                   param_.stride[0],
-                                   param_.stride[1],
-                                   param_.dilate[0],
-                                   param_.dilate[1]),
-                    gdata[i][0].shape_));
-      }
-    }
-    if (!param_.no_bias) {
-      Tensor<xpu, 1, DType> gbias = in_grad[conv_v1::kBias].get<xpu, 1, DType>(s);
-      Assign(gbias, req[conv_v1::kBias], sumall_except_dim<1>(grad));
-    }
-  }
-
- private:
-  inline index_t InitTemp(const mshadow::Shape<4> &ishape,
-                          const mshadow::Shape<4> &oshape) {
-    const int ksize_y = param_.kernel[0];
-    const int ksize_x = param_.kernel[1];
-    shape_colunit_ = mshadow::Shape2(ishape[1] * ksize_y * ksize_x,
-                                     oshape[2] * oshape[3]);
-    shape_dstunit_ = mshadow::Shape3(param_.num_group,
-                                     param_.num_filter / param_.num_group,
-                                     oshape[2] * oshape[3]);
-    // param_.workspace is in elements of sizeof(DType)
-    // if param_.workspace is set to zero the nstep_ equals ishape[0] (batch)
-    nstep_ = std::max<index_t>(
-        std::min<index_t>(param_.workspace /
-          (shape_colunit_.Size() + shape_dstunit_.Size()), ishape[0]),
-      1);
-
-    mshadow::Shape<2> scol = mshadow::Shape2(shape_colunit_[0],
-                                             shape_colunit_[1] * nstep_);
-    mshadow::Shape<3> sdst = mshadow::Shape3(shape_dstunit_[0],
-                                             shape_dstunit_[1],
-                                             shape_dstunit_[2] * nstep_);
-    index_t required_size = scol.Size() + sdst.Size();
-    return required_size;
-  }
-
-  ConvolutionV1Param param_;
-  mshadow::Shape<2> shape_colunit_;
-  mshadow::Shape<3> shape_dstunit_;
-  index_t nstep_;
-};  // class ConvolutionV1Op
-
-template<typename xpu>
-Operator* CreateOp(ConvolutionV1Param param, int dtype,
-                   mxnet::ShapeVector *in_shape,
-                   mxnet::ShapeVector *out_shape,
-                   Context ctx);
-
-#if DMLC_USE_CXX11
-class ConvolutionV1Prop : public OperatorProperty {
- public:
-  std::vector<std::string> ListArguments() const override {
-    if (!param_.no_bias) {
-      return {"data", "weight", "bias"};
-    } else {
-      return {"data", "weight"};
-    }
-  }
-
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    using namespace mshadow;
-    param_.Init(kwargs);
-    if (param_.kernel.ndim() == 2) {
-      param_.layout = param_.layout ? param_.layout.value() : mshadow::kNCHW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape2(1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
-    } else {
-      CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim() << "D convolution not supported";
-      param_.layout = param_.layout ? param_.layout.value(): mshadow::kNCDHW;
-      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
-      if (param_.dilate.ndim() == 0) param_.dilate = Shape3(1, 1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
-    }
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(mxnet::ShapeVector *in_shape,
-                  mxnet::ShapeVector *out_shape,
-                  mxnet::ShapeVector *aux_shape) const override {
-    using namespace mshadow;
-    if (!param_.no_bias) {
-      CHECK_EQ(in_shape->size(), 3U) << "Input:[data, weight, bias]";
-    } else {
-      CHECK_EQ(in_shape->size(), 2U) << "Input:[data, weight]";
-    }
-    // CHECK_EQ(out_shape->size(), 1) << "Output: [output]";
-    out_shape->resize(1, mxnet::TShape());
-    const mxnet::TShape &dshp = (*in_shape)[conv_v1::kData];
-    if (!mxnet::ndim_is_known(dshp)) return false;
-    if (param_.kernel.ndim() == 2) {
-      // 2d conv_v1
-      CHECK_EQ(dshp.ndim(), 4U) \
-          << "Input data should be 4D in batch-num_filter-y-x";
-      Shape<4> dshape = ConvertLayout(dshp.get<4>(), param_.layout.value(), kNCHW);
-      Shape<4> wshape = Shape4(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
-                               param_.kernel[0], param_.kernel[1]);
-      wshape = ConvertLayout(wshape, kNCHW, param_.layout.value());
-      wshape[0] *= param_.num_group;
-      SHAPE_ASSIGN_CHECK(*in_shape, conv_v1::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, conv_v1::kBias, Shape1(param_.num_filter));
-      }
-
-      const index_t ksize_y = static_cast<index_t>(param_.kernel[0]);
-      const index_t ksize_x = static_cast<index_t>(param_.kernel[1]);
-      CHECK_EQ(dshape[1] % param_.num_group, 0) \
-          << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0) \
-          << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0) \
-          << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0) \
-          << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0) \
-          << "incorrect dilate size: " << param_.dilate;
-      CHECK(ksize_y <= dshape[2] + 2 * param_.pad[0]
-            && ksize_x <= dshape[3] + 2 * param_.pad[1])
-          << "kernel size exceed input";
-      Shape<4> oshape;
-      oshape[0] = dshape[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = (dshape[2] + 2 * param_.pad[0] -
-          (param_.dilate[0] * (ksize_y - 1) + 1)) / param_.stride[0] + 1;
-      oshape[3] = (dshape[3] + 2 * param_.pad[1] -
-          (param_.dilate[1] * (ksize_x - 1) + 1)) / param_.stride[1] + 1;
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCHW, param_.layout.value()));
-      return true;
-    } else if (param_.kernel.ndim() == 3) {
-      // 3d conv_v1
-      CHECK_EQ(dshp.ndim(), 5U) \
-        << "Input data should be 5D in batch-num_filter-depth-y-x";
-      Shape<5> dshape = ConvertLayout(dshp.get<5>(), param_.layout.value(), kNCDHW);
-      Shape<5> wshape = Shape5(param_.num_filter / param_.num_group, dshape[1] / param_.num_group,
-                               param_.kernel[0], param_.kernel[1], param_.kernel[2]);
-      wshape = ConvertLayout(wshape, kNCDHW, param_.layout.value());
-      wshape[0] *= param_.num_group;
-      SHAPE_ASSIGN_CHECK(*in_shape, conv_v1::kWeight, wshape);
-      if (!param_.no_bias) {
-        SHAPE_ASSIGN_CHECK(*in_shape, conv_v1::kBias, Shape1(param_.num_filter));
-      }
-
-      const index_t ksize_d = static_cast<index_t>(param_.kernel[0]);
-      const index_t ksize_y = static_cast<index_t>(param_.kernel[1]);
-      const index_t ksize_x = static_cast<index_t>(param_.kernel[2]);
-      CHECK_EQ(dshape[1] % param_.num_group, 0)
-        << "input num_filter must divide group size";
-      CHECK_EQ(param_.num_filter % param_.num_group, 0)
-        << "output num_filter must divide group size";
-      CHECK_GT(param_.kernel.Size(), 0) \
-        << "incorrect kernel size: " << param_.kernel;
-      CHECK_GT(param_.stride.Size(), 0) \
-        << "incorrect stride size: " << param_.stride;
-      CHECK_GT(param_.dilate.Size(), 0) \
-        << "incorrect dilate size: " << param_.dilate;
-      CHECK(ksize_d <= dshape[2] + 2 * param_.pad[0]
-            && ksize_y <= dshape[3] + 2 * param_.pad[1]
-            && ksize_x <= dshape[4] + 2 * param_.pad[2])
-        << "kernel size exceed input";
-      CHECK_EQ(param_.dilate.Size(), 1U)
-        << "Dilate is not supported in 3d convolution";
-      Shape<5> oshape;
-      oshape[0] = dshape[0];
-      oshape[1] = param_.num_filter;
-      oshape[2] = (dshape[2] + 2 * param_.pad[0] -
-          (1 * (ksize_d - 1) + 1)) / param_.stride[0] + 1;
-      oshape[3] = (dshape[3] + 2 * param_.pad[1] -
-          (1 * (ksize_y - 1) + 1)) / param_.stride[1] + 1;
-      oshape[4] = (dshape[4] + 2 * param_.pad[2] -
-          (1 * (ksize_x - 1) + 1)) / param_.stride[2] + 1;
-      SHAPE_ASSIGN_CHECK(*out_shape, 0, ConvertLayout(oshape, kNCDHW, param_.layout.value()));
-      return true;
-    } else {
-      LOG(FATAL) << "Unknown convolution type";
-      return false;
-    }
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_GE(in_type->size(), 1);
-    int dtype = (*in_type)[0];
-    CHECK_NE(dtype, -1) << "First input must have specified type";
-    for (size_t i = 0; i < in_type->size(); ++i) {
-      if ((*in_type)[i] == -1) {
-        (*in_type)[i] = dtype;
-      } else {
-        UNIFORM_TYPE_CHECK((*in_type)[i], dtype, ListArguments()[i]);
-      }
-    }
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    auto ptr = new ConvolutionV1Prop();
-    ptr->param_ = param_;
-    return ptr;
-  }
-
-  std::string TypeString() const override {
-    return "Convolution_v1";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[conv_v1::kOut], in_data[conv_v1::kData], in_data[conv_v1::kWeight]};
-  }
-
-  std::vector<ResourceRequest> ForwardResource(
-      const mxnet::ShapeVector &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
-
-  std::vector<ResourceRequest> BackwardResource(
-      const mxnet::ShapeVector &in_shape) const override {
-    return {ResourceRequest::kTempSpace};
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return nullptr;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  ConvolutionV1Param param_;
-};  // class ConvolutionV1Prop
-#endif  // DMLC_USE_CXX11
-}  // namespace op
-}  // namespace mxnet
-#endif  // MXNET_OPERATOR_CONVOLUTION_V1_INL_H_
diff --git a/src/operator/convolution_v1.cc b/src/operator/convolution_v1.cc
deleted file mode 100644
index 5d1ce3108a3f..000000000000
--- a/src/operator/convolution_v1.cc
+++ /dev/null
@@ -1,64 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file convolution_v1.cc
- * \brief
- * \author Bing Xu
-*/
-
-#include "./convolution_v1-inl.h"
-namespace mxnet {
-namespace op {
-DMLC_REGISTER_PARAMETER(ConvolutionV1Param);
-
-template<>
-Operator* CreateOp<cpu>(ConvolutionV1Param param, int dtype,
-                        mxnet::ShapeVector *in_shape,
-                        mxnet::ShapeVector *out_shape,
-                        Context ctx) {
-  Operator *op = nullptr;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new ConvolutionV1Op<cpu, DType>(param);
-  })
-  return op;
-}
-
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator *ConvolutionV1Prop::CreateOperatorEx(Context ctx,
-                                              mxnet::ShapeVector *in_shape,
-                                              std::vector<int> *in_type) const {
-  mxnet::ShapeVector out_shape, aux_shape;
-  std::vector<int> out_type, aux_type;
-  CHECK(InferType(in_type, &out_type, &aux_type));
-  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0], in_shape, &out_shape, ctx);
-}
-
-MXNET_REGISTER_OP_PROPERTY(Convolution_v1, ConvolutionV1Prop)
-.add_argument("data", "NDArray-or-Symbol", "Input data to the ConvolutionV1Op.")
-.add_argument("weight", "NDArray-or-Symbol", "Weight matrix.")
-.add_argument("bias", "NDArray-or-Symbol", "Bias parameter.")
-.add_arguments(ConvolutionV1Param::__FIELDS__())
-.describe("This operator is DEPRECATED."
-          " Apply convolution to input then add a bias.");
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/convolution_v1.cu b/src/operator/convolution_v1.cu
deleted file mode 100644
index 0f40c30eeb1d..000000000000
--- a/src/operator/convolution_v1.cu
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file convolution_v1.cu
- * \brief
- * \author Bing Xu
-*/
-
-#include "./convolution_v1-inl.h"
-#include <vector>
-#if MXNET_USE_CUDNN == 1
-#include "./nn/cudnn/cudnn_convolution-inl.h"
-#endif  // MXNET_USE_CUDNN
-
-namespace mxnet {
-namespace op {
-template<>
-Operator* CreateOp<gpu>(ConvolutionV1Param param, int dtype,
-                        mxnet::ShapeVector *in_shape,
-                        mxnet::ShapeVector *out_shape,
-                        Context ctx) {
-  Operator *op = nullptr;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    op = new ConvolutionV1Op<gpu, DType>(param);
-  })
-  return op;
-}
-
-}  // namespace op
-}  // namespace mxnet
-
diff --git a/src/operator/pooling_v1-inl.h b/src/operator/pooling_v1-inl.h
deleted file mode 100644
index 6c7845d9d33b..000000000000
--- a/src/operator/pooling_v1-inl.h
+++ /dev/null
@@ -1,378 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file pooling_v1-inl.h
- * \brief
- * \author Bing Xu
-*/
-
-#ifndef MXNET_OPERATOR_POOLING_V1_INL_H_
-#define MXNET_OPERATOR_POOLING_V1_INL_H_
-
-#include <dmlc/logging.h>
-#include <dmlc/parameter.h>
-#include <mxnet/operator.h>
-#include <algorithm>
-#include <map>
-#include <vector>
-#include <string>
-#include <utility>
-#include "./operator_common.h"
-
-namespace mxnet {
-namespace op {
-
-namespace pool_v1_enum {
-enum PoolingV1OpInputs {kData};
-enum PoolingV1OpOutputs {kOut};
-enum PoolingV1OpType {kMaxPooling, kAvgPooling, kSumPooling};
-enum PoolingV1OpPadConventionType {kValid, kFull};
-}  // namespace pool_v1_enum
-
-struct PoolingV1Param : public dmlc::Parameter<PoolingV1Param> {
-  mxnet::TShape kernel;
-  mxnet::TShape stride;
-  mxnet::TShape pad;
-  int pool_type;
-  int pooling_convention;
-  bool global_pool;
-  DMLC_DECLARE_PARAMETER(PoolingV1Param) {
-    DMLC_DECLARE_FIELD(kernel).set_default(mxnet::TShape(0, -1))
-    .enforce_nonzero()
-    .describe("pooling kernel size: (y, x) or (d, y, x)");
-
-    DMLC_DECLARE_FIELD(pool_type).set_default(pool_v1_enum::kMaxPooling)
-    .add_enum("max", pool_v1_enum::kMaxPooling)
-    .add_enum("avg", pool_v1_enum::kAvgPooling)
-    .add_enum("sum", pool_v1_enum::kSumPooling)
-    .describe("Pooling type to be applied.");
-
-    DMLC_DECLARE_FIELD(global_pool).set_default(false)
-    .describe("Ignore kernel size, do global pooling based on current input feature map. ");
-
-    DMLC_DECLARE_FIELD(pooling_convention).set_default(pool_v1_enum::kValid)
-    .add_enum("full", pool_v1_enum::kFull)
-    .add_enum("valid", pool_v1_enum::kValid)
-    .describe("Pooling convention to be applied.");
-
-    DMLC_DECLARE_FIELD(stride).set_default(mxnet::TShape(0, -1))
-    .enforce_nonzero()
-    .describe("stride: for pooling (y, x) or (d, y, x)");
-
-    DMLC_DECLARE_FIELD(pad).set_default(mxnet::TShape(0, -1))
-    .describe("pad for pooling: (y, x) or (d, y, x)");
-  }
-};
-
-template<typename xpu, typename Reducer, typename DType>
-class PoolingV1Op : public Operator {
- public:
-  explicit PoolingV1Op(PoolingV1Param p) {
-    this->param_ = p;
-  }
-
-  virtual void Forward(const OpContext &ctx,
-                       const std::vector<TBlob> &in_data,
-                       const std::vector<OpReqType> &req,
-                       const std::vector<TBlob> &out_data,
-                       const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_data.size(), 1);
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    if (param_.kernel.ndim() == 3) {
-      LOG(FATAL) << "3D kernel not implemented";
-    }
-
-    // reset padding size for global pooling
-    mxnet::TShape padding = param_.pad;
-    // mxnet::TShape kernel = param_.kernel;
-    if (param_.global_pool) {
-      padding[0] = padding[1] = 0;
-      // kernel[0] = kernel[1] = 0;
-    }
-
-    Tensor<xpu, 4, DType> data = in_data[pool_v1_enum::kData].get<xpu, 4, DType>(s);
-    Tensor<xpu, 4, DType> out = out_data[pool_v1_enum::kOut].get<xpu, 4, DType>(s);
-    mshadow::Shape<2> out_shape = Shape2(out.shape_[2], out.shape_[3]);
-    if (param_.pool_type == pool_v1_enum::kMaxPooling
-        || param_.pool_type == pool_v1_enum::kSumPooling) {
-      Assign(out,
-             req[pool_v1_enum::kOut],
-             pool<Reducer>(pad(data, padding[0], padding[1]),
-                           out_shape,
-                           param_.global_pool ? data.shape_[2] : param_.kernel[0],
-                           param_.global_pool ? data.shape_[3] : param_.kernel[1],
-                           param_.global_pool ? 1 : param_.stride[0],
-                           param_.global_pool ? 1 : param_.stride[1]));
-    } else if (param_.pool_type == pool_v1_enum::kAvgPooling) {
-      Assign(out,
-             req[pool_v1_enum::kOut],
-             scalar<DType>(1.0f / (param_.global_pool ?
-                      data.shape_[2] * data.shape_[3] :
-                      param_.kernel[0] * param_.kernel[1])) * \
-             pool<Reducer>(pad(data, padding[0], padding[1]),
-                           out_shape,
-                           param_.global_pool ? data.shape_[2] : param_.kernel[0],
-                           param_.global_pool ? data.shape_[3] : param_.kernel[1],
-                           param_.global_pool ? 1 : param_.stride[0],
-                           param_.global_pool ? 1 : param_.stride[1]));
-    }
-  }
-
-  virtual void Backward(const OpContext &ctx,
-                        const std::vector<TBlob> &out_grad,
-                        const std::vector<TBlob> &in_data,
-                        const std::vector<TBlob> &out_data,
-                        const std::vector<OpReqType> &req,
-                        const std::vector<TBlob> &in_grad,
-                        const std::vector<TBlob> &aux_args) {
-    using namespace mshadow;
-    using namespace mshadow::expr;
-    CHECK_EQ(out_grad.size(), 1);
-    CHECK_EQ(in_data.size(), 1);
-    CHECK_EQ(out_data.size(), 1);
-    CHECK_EQ(req.size(), 1);
-    CHECK_EQ(in_grad.size(), 1);
-    // TODO(bing): remove pad (0,0)
-    if (param_.kernel.ndim() == 3) {
-      LOG(FATAL) << "3D kernel not implemented";
-    }
-
-    // reset padding size for global pooling
-    mxnet::TShape padding = param_.pad;
-    if (param_.global_pool) {
-      padding[0] = padding[1] = 0;
-    }
-
-    Stream<xpu> *s = ctx.get_stream<xpu>();
-    Tensor<xpu, 4, DType> grad = out_grad[pool_v1_enum::kOut].get<xpu, 4, DType>(s);
-    Tensor<xpu, 4, DType> data = in_data[pool_v1_enum::kData].get<xpu, 4, DType>(s);
-    Tensor<xpu, 4, DType> output_data = out_data[pool_v1_enum::kOut].get<xpu, 4, DType>(s);
-    Tensor<xpu, 4, DType> input_grad = in_grad[pool_v1_enum::kData].get<xpu, 4, DType>(s);
-
-    mshadow::Shape<2> in_shape = Shape2(data.shape_[2], data.shape_[3]);
-
-    if (param_.pool_type == pool_v1_enum::kMaxPooling
-        || param_.pool_type == pool_v1_enum::kSumPooling) {
-      Assign(input_grad, req[pool_v1_enum::kData],
-             crop(unpool<Reducer>(pad(data, padding[0], padding[1]),
-                                  pad(output_data, 0, 0),
-                                  pad(grad, 0, 0),
-                                  param_.global_pool ? in_shape[0] : param_.kernel[0],
-                                  param_.global_pool ? in_shape[1] : param_.kernel[1],
-                                  param_.global_pool ? 1 : param_.stride[0],
-                                  param_.global_pool ? 1 : param_.stride[1]),
-                  in_shape,
-                  padding[0],
-                  padding[1]));
-    } else if (param_.pool_type == pool_v1_enum::kAvgPooling) {
-      Assign(input_grad, req[pool_v1_enum::kData],
-             scalar<DType>(1.0f / (param_.global_pool ?
-                      data.shape_[2] * data.shape_[3] :
-                      param_.kernel[0] * param_.kernel[1])) * \
-             crop(unpool<Reducer>(pad(data, padding[0], padding[1]),
-                                  pad(output_data, 0, 0),
-                                  pad(grad, 0, 0),
-                                  param_.global_pool ? in_shape[0] : param_.kernel[0],
-                                  param_.global_pool ? in_shape[1] : param_.kernel[1],
-                                  param_.global_pool ? 1 : param_.stride[0],
-                                  param_.global_pool ? 1 : param_.stride[1]),
-                  in_shape,
-                  padding[0],
-                  padding[1]));
-    }
-  }
-
- private:
-  PoolingV1Param param_;
-};  // class PoolingV1Op
-
-template<typename xpu>
-Operator* CreateOp(PoolingV1Param param, int dtype);
-
-
-#if DMLC_USE_CXX11
-class PoolingV1Prop : public OperatorProperty {
- public:
-  void Init(const std::vector<std::pair<std::string, std::string> >& kwargs) override {
-    using namespace mshadow;
-    param_.Init(kwargs);
-    if (param_.kernel.ndim() == 1) {
-      if (param_.stride.ndim() == 0) param_.stride = Shape1(1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape1(0);
-    } else if (param_.kernel.ndim() == 2) {
-      if (param_.stride.ndim() == 0) param_.stride = Shape2(1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape2(0, 0);
-    } else {
-        // ignore kernel size only if global_pool not assigned false
-        if (param_.global_pool == false) {
-          CHECK_EQ(param_.kernel.ndim(), 3U) << param_.kernel.ndim()
-              << "D pooling not supported";
-        }
-      if (param_.stride.ndim() == 0) param_.stride = Shape3(1, 1, 1);
-      if (param_.pad.ndim() == 0) param_.pad = Shape3(0, 0, 0);
-    }
-  }
-
-  std::map<std::string, std::string> GetParams() const override {
-    return param_.__DICT__();
-  }
-
-  bool InferShape(mxnet::ShapeVector *in_shape,
-                  mxnet::ShapeVector *out_shape,
-                  mxnet::ShapeVector *aux_shape) const override {
-    CHECK_EQ(in_shape->size(), 1U);
-    const mxnet::TShape &dshape = (*in_shape)[0];
-    CHECK_GE(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x) "
-                               << "Or 5D in (batch, channel, d, y, x)";
-    CHECK_LE(dshape.ndim(), 5) << "Pooling: Input data should be 4D in (batch, channel, y, x) "
-                               << "Or 5D in (batch, channel, d, y, x)";
-    mxnet::TShape oshape = dshape;
-    if (dshape.ndim() ==  -1) return false;
-    if (param_.global_pool) {
-      if (dshape.ndim() == 4) {
-        oshape[2] = 1;
-        oshape[3] = 1;
-      } else {
-        oshape[2] = 1;
-        oshape[3] = 1;
-        oshape[4] = 1;
-      }
-      out_shape->clear();
-      out_shape->push_back(oshape);
-    } else if (param_.kernel.ndim() == 2) {
-      CHECK_EQ(dshape.ndim(), 4) << "Pooling: Input data should be 4D in (batch, channel, y, x)";
-      CHECK(param_.kernel[0] <= dshape[2] + 2 * param_.pad[0])
-          << "kernel size (" << param_.kernel[0] << ") exceeds input (" << dshape[2]
-          << " padded to " << (dshape[2] + 2*param_.pad[0]) << ")";
-      CHECK(param_.kernel[1] <= dshape[3] + 2 * param_.pad[1])
-          << "kernel size (" << param_.kernel[1] << ") exceeds input (" << dshape[3]
-          << " padded to " << (dshape[3] + 2*param_.pad[1]) << ")";
-      if (param_.pooling_convention == pool_v1_enum::kValid) {
-          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
-                              param_.stride[0];
-          oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
-                              param_.stride[1];
-      } else {
-          oshape[2] = 1 + static_cast<int>(std::ceil(static_cast<float>(
-                              dshape[2] + 2 * param_.pad[0] -
-                              param_.kernel[0]) / param_.stride[0]));
-          oshape[3] = 1 + static_cast<int>(std::ceil(static_cast<float>(
-                              dshape[3] + 2 * param_.pad[1] -
-                              param_.kernel[1]) / param_.stride[1]));
-      }
-
-      out_shape->clear();
-      out_shape->push_back(oshape);
-    } else if (param_.kernel.ndim() == 3) {
-      CHECK_EQ(dshape.ndim(), 5) << "Pooling: Input data should be 5D in (batch, channel, d, y, x)";
-      CHECK_LE(param_.kernel[0], dshape[2] + 2 * param_.pad[0]) << "kernel size exceeds input";
-      CHECK_LE(param_.kernel[1], dshape[3] + 2 * param_.pad[1]) << "kernel size exceeds input";
-      CHECK_LE(param_.kernel[2], dshape[4] + 2 * param_.pad[2]) << "kernel size exceeds input";
-      if (param_.pooling_convention == pool_v1_enum::kValid) {
-          oshape[2] = 1 + (dshape[2] + 2 * param_.pad[0] - param_.kernel[0]) /
-                              param_.stride[0];
-          oshape[3] = 1 + (dshape[3] + 2 * param_.pad[1] - param_.kernel[1]) /
-                              param_.stride[1];
-          oshape[4] = 1 + (dshape[4] + 2 * param_.pad[2] - param_.kernel[2]) /
-                              param_.stride[2];
-      } else {
-          oshape[2] = 1 + static_cast<int>(std::ceil(static_cast<float>(
-                              dshape[2] + 2 * param_.pad[0] -
-                              param_.kernel[0]) / param_.stride[0]));
-          oshape[3] = 1 + static_cast<int>(std::ceil(static_cast<float>(
-                              dshape[3] + 2 * param_.pad[1] -
-                              param_.kernel[1]) / param_.stride[1]));
-          oshape[4] = 1 + static_cast<int>(std::ceil(static_cast<float>(
-                              dshape[4] + 2 * param_.pad[2] -
-                              param_.kernel[2]) / param_.stride[2]));
-      }
-
-      out_shape->clear();
-      out_shape->push_back(oshape);
-    }
-    return true;
-  }
-
-  bool InferType(std::vector<int> *in_type,
-                 std::vector<int> *out_type,
-                 std::vector<int> *aux_type) const override {
-    CHECK_EQ(in_type->size(), 1);
-    int dtype = (*in_type)[0];
-
-    if (dtype == -1) {
-      LOG(FATAL) << "Input type to pooling is not specified.";
-      return false;
-    }
-
-    out_type->clear();
-    out_type->push_back(dtype);
-    return true;
-  }
-
-  OperatorProperty* Copy() const override {
-    PoolingV1Prop *prop_sym = new PoolingV1Prop();
-    prop_sym->param_ = this->param_;
-    return prop_sym;
-  }
-
-  std::string TypeString() const override {
-    return "Pooling_v1";
-  }
-
-  std::vector<int> DeclareBackwardDependency(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data) const override {
-    return {out_grad[pool_v1_enum::kOut], in_data[pool_v1_enum::kData],
-            out_data[pool_v1_enum::kOut]};
-  }
-
-  std::vector<std::pair<int, void*> > BackwardInplaceOption(
-    const std::vector<int> &out_grad,
-    const std::vector<int> &in_data,
-    const std::vector<int> &out_data,
-    const std::vector<void*> &in_grad) const override {
-#if MXNET_USE_CUDNN == 1
-    return {};
-#else
-    return {{in_data[pool_v1_enum::kData], in_grad[pool_v1_enum::kData]}};
-#endif
-  }
-
-  Operator* CreateOperator(Context ctx) const override {
-    LOG(FATAL) << "Not Implemented.";
-    return nullptr;
-  }
-
-  Operator* CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape,
-                             std::vector<int> *in_type) const override;
-
- private:
-  PoolingV1Param param_;
-};  // class PoolingV1Prop
-#endif  // DMLC_USE_CXX11
-}  // namespace op
-}  // namespace mxnet
-
-#endif  // MXNET_OPERATOR_POOLING_V1_INL_H_
diff --git a/src/operator/pooling_v1.cc b/src/operator/pooling_v1.cc
deleted file mode 100644
index 9e350e88c9ee..000000000000
--- a/src/operator/pooling_v1.cc
+++ /dev/null
@@ -1,109 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file pooling_v1.cc
- * \brief
- * \author Bing Xu
-*/
-#include "./pooling_v1-inl.h"
-
-namespace mxnet {
-namespace op {
-
-template<>
-Operator *CreateOp<cpu>(PoolingV1Param param, int dtype) {
-  Operator *op = nullptr;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    switch (param.pool_type) {
-      case pool_v1_enum::kMaxPooling:
-        op = new PoolingV1Op<cpu, mshadow::red::maximum, DType>(param);
-        break;
-      case pool_v1_enum::kAvgPooling:
-        op = new PoolingV1Op<cpu, mshadow::red::sum, DType>(param);
-        break;
-      case pool_v1_enum::kSumPooling:
-        op = new PoolingV1Op<cpu, mshadow::red::sum, DType>(param);
-        break;
-      default:
-        LOG(FATAL) << "unknown pooling type";
-        return nullptr;
-    }
-  })
-
-  return op;
-}
-
-// DO_BIND_DISPATCH comes from operator_common.h
-Operator* PoolingV1Prop::CreateOperatorEx(Context ctx, mxnet::ShapeVector *in_shape,
-                                     std::vector<int> *in_type) const {
-  mxnet::ShapeVector out_shape, aux_shape;
-  std::vector<int> out_type, aux_type;
-  CHECK(InferType(in_type, &out_type, &aux_type));
-  CHECK(InferShape(in_shape, &out_shape, &aux_shape));
-  DO_BIND_DISPATCH(CreateOp, param_, (*in_type)[0]);
-}
-
-DMLC_REGISTER_PARAMETER(PoolingV1Param);
-
-MXNET_REGISTER_OP_PROPERTY(Pooling_v1, PoolingV1Prop)
-.describe(R"code(This operator is DEPRECATED.
-Perform pooling on the input.
-
-The shapes for 2-D pooling is
-
-- **data**: *(batch_size, channel, height, width)*
-- **out**: *(batch_size, num_filter, out_height, out_width)*, with::
-
-    out_height = f(height, kernel[0], pad[0], stride[0])
-    out_width = f(width, kernel[1], pad[1], stride[1])
-
-The definition of *f* depends on ``pooling_convention``, which has two options:
-
-- **valid** (default)::
-
-    f(x, k, p, s) = floor((x+2*p-k)/s)+1
-
-- **full**, which is compatible with Caffe::
-
-    f(x, k, p, s) = ceil((x+2*p-k)/s)+1
-
-But ``global_pool`` is set to be true, then do a global pooling, namely reset
-``kernel=(height, width)``.
-
-Three pooling options are supported by ``pool_type``:
-
-- **avg**: average pooling
-- **max**: max pooling
-- **sum**: sum pooling
-
-1-D pooling is special case of 2-D pooling with *weight=1* and
-*kernel[1]=1*.
-
-For 3-D pooling, an additional *depth* dimension is added before
-*height*. Namely the input data will have shape *(batch_size, channel, depth,
-height, width)*.
-
-)code" ADD_FILELINE)
-.add_argument("data", "NDArray-or-Symbol", "Input data to the pooling operator.")
-.add_arguments(PoolingV1Param::__FIELDS__());
-
-}  // namespace op
-}  // namespace mxnet
diff --git a/src/operator/pooling_v1.cu b/src/operator/pooling_v1.cu
deleted file mode 100644
index f648a7c1f909..000000000000
--- a/src/operator/pooling_v1.cu
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *   http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-
-/*!
- * Copyright (c) 2015 by Contributors
- * \file pooling_v1.cu
- * \brief
- * \author Bing Xu
-*/
-#include <vector>
-#include "./pooling_v1-inl.h"
-
-namespace mxnet {
-namespace op {
-template<>
-Operator *CreateOp<gpu>(PoolingV1Param param, int dtype) {
-  Operator *op = nullptr;
-  MSHADOW_REAL_TYPE_SWITCH(dtype, DType, {
-    switch (param.pool_type) {
-      case pool_v1_enum::kMaxPooling:
-        op = new PoolingV1Op<gpu, mshadow::red::maximum, DType>(param);
-        break;
-      case pool_v1_enum::kAvgPooling:
-        op = new PoolingV1Op<gpu, mshadow::red::sum, DType>(param);
-        break;
-      case pool_v1_enum::kSumPooling:
-        op = new PoolingV1Op<gpu, mshadow::red::sum, DType>(param);
-        break;
-      default:
-        LOG(FATAL) << "unknown pooling type";
-        return nullptr;
-    }
-  });
-  return op;
-}
-
-}  // namespace op
-}  // namespace mxnet
-
diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py
index 519c02f141e9..37e0eedafc05 100644
--- a/tests/python/gpu/test_operator_gpu.py
+++ b/tests/python/gpu/test_operator_gpu.py
@@ -727,16 +727,12 @@ def test_2D_with_width(width, grad_req):
 def test_convolution_versions():
     # 2D convolution NCHW
-    ctx_list = [{'ctx': mx.cpu(0), 'conv_data': (2, 2, 7, 7), 'type_dict': {'conv_data': np.float32}},
-                {'ctx': mx.gpu(0), 'conv_data': (2, 2, 7, 7), 'type_dict': {'conv_data': np.float32}},
-                {'ctx': mx.gpu(0), 'conv_data': (2, 2, 7, 7), 'type_dict': {'conv_data': np.float32}},
+    ctx_list = [{'ctx': mx.gpu(0), 'conv_data': (2, 2, 7, 7), 'type_dict': {'conv_data': np.float32}},
                 {'ctx': mx.cpu(0), 'conv_data': (2, 2, 7, 7), 'type_dict': {'conv_data': np.float32}},
                 {'ctx': mx.gpu(0), 'conv_data': (2, 2, 7, 7), 'type_dict': {'conv_data': np.float32}}]
-    conv_v1_cpu = mx.sym.Convolution_v1(num_filter=3, kernel=(3,3), pad=(1,1), name='conv')
-    conv_v1_gpu = mx.sym.Convolution_v1(num_filter=3, kernel=(3,3), pad=(1,1), cudnn_off=True, name='conv')
     conv_cudnn = mx.sym.Convolution(num_filter=3, kernel=(3,3), pad=(1,1), name='conv')
     conv_cpu = mx.sym.Convolution(num_filter=3, kernel=(3,3), pad=(1,1), name='conv')
     conv_gpu = mx.sym.Convolution(num_filter=3, kernel=(3,3), pad=(1,1), cudnn_off=True, name='conv')
-    syms = [conv_v1_cpu, conv_v1_gpu, conv_cudnn, conv_cpu, conv_gpu]
+    syms = [conv_cudnn, conv_cpu, conv_gpu]
     check_consistency(syms, ctx_list)
 
     # 3D convolution NCDHW
@@ -1043,30 +1039,27 @@ def test_pooling_versions_helper(pool_op_list, data, kernel, pool_type, pad, str
                 if not is_default_stride(stride) or random_choice():
                     pool_op_args.update({'stride' : stride})
 
-            expected_pool_ops = ['pool', 'pool_transposed', 'pool_v1']
-            if pool_op == 'pool_v1':
-                sym = mx.sym.Pooling_v1(**pool_op_args)
+            expected_pool_ops = ['pool', 'pool_transposed']
+            pool_op_args.update({'p_value' : p_value, 'count_include_pad' : count_include_pad})
+            if ctx_type != 'cpu':
+                pool_op_args['cudnn_off'] = ctx_type == 'gpu'
+            if pool_op == 'pool':
+                # isolate pooling input from symbol input to test shared tensor optimizations
+                buffered_input = mx.sym.identity(name='pool')
+                sym = mx.sym.Pooling(buffered_input, **pool_op_args)
+            elif pool_op == 'pool_transposed':
+                ndim = len(data)
+                # NCW->NWC axes=(0,2,1) NCHW->NHWC axes=(0,2,3,1) NCDHW->NDHWC axes=(0,2,3,4,1);
+                axes = (0,) + tuple(range(2,ndim)) + (1,)
+                transposed = mx.sym.transpose(axes=axes, name='pool')
+                pooled = mx.sym.Pooling(data=transposed, layout=transposed_layout(ndim),
+                                        **pool_op_args)
+                # NWC->NCW axes=(0,2,1) NHWC->NCHW axes=(0,3,1,2) NDHWC->NCDHW axes=(0,4,1,2,3);
+                axes = (0, ndim-1) + tuple(range(1,ndim-1))
+                sym = mx.sym.transpose(data=pooled, axes=axes, name='pool')
             else:
-                pool_op_args.update({'p_value' : p_value, 'count_include_pad' : count_include_pad})
-                if ctx_type != 'cpu':
-                    pool_op_args['cudnn_off'] = ctx_type == 'gpu'
-                if pool_op == 'pool':
-                    # isolate pooling input from symbol input to test shared tensor optimizations
-                    buffered_input = mx.sym.identity(name='pool')
-                    sym = mx.sym.Pooling(buffered_input, **pool_op_args)
-                elif pool_op == 'pool_transposed':
-                    ndim = len(data)
-                    # NCW->NWC axes=(0,2,1) NCHW->NHWC axes=(0,2,3,1) NCDHW->NDHWC axes=(0,2,3,4,1);
-                    axes = (0,) + tuple(range(2,ndim)) + (1,)
-                    transposed = mx.sym.transpose(axes=axes, name='pool')
-                    pooled = mx.sym.Pooling(data=transposed, layout=transposed_layout(ndim),
-                                            **pool_op_args)
-                    # NWC->NCW axes=(0,2,1) NHWC->NCHW axes=(0,3,1,2) NDHWC->NCDHW axes=(0,4,1,2,3);
-                    axes = (0, ndim-1) + tuple(range(1,ndim-1))
-                    sym = mx.sym.transpose(data=pooled, axes=axes, name='pool')
-                else:
-                    raise RuntimeError('Expected one of {}, saw {}.'.format(expected_pool_ops,
-                                                                            pool_op))
+                raise RuntimeError('Expected one of {}, saw {}.'.format(expected_pool_ops,
+                                                                        pool_op))
             sym_list.append(sym)
 
         check_consistency(sym_list, ctx_list, equal_nan=(not count_include_pad), rtol=tol, atol=tol)
@@ -1128,10 +1121,6 @@ def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_inclu
     std_pool_op_list = ['pool_cpu', 'pool_transposed_cpu',
                         'pool_gpu', 'pool_transposed_gpu',
                         'pool_cudnn', 'pool_transposed_cudnn']
-    # The implementations of the 'v1' pooling operator
-    v1_pool_op_list = ['pool_v1_cpu', 'pool_v1_gpu']
-    # For those cases when all implementations should match- the combined implementation list.
-    combo_pool_op_list = std_pool_op_list + v1_pool_op_list
 
     for dtype in [np.float32, np.float64, np.float16]:
         # Testing of the standard (not 'v1') pooling operator is universal across all
@@ -1145,47 +1134,6 @@ def test_pooling_dim(dim, pool_type, dtype, pool_op_list, p_value=2, count_inclu
             test_pooling_dim(dim, 'lp', dtype, std_pool_op_list, p_value=2)
             test_pooling_dim(dim, 'lp', dtype, std_pool_op_list, p_value=3)
 
-        # Testing of the 'v1' pooling operator is over its restricted support domain of
-        # 2D data only and not with the 'lp' pooling type.  The 'v1' cpu and gpu versions are
-        # always tested against each other, and sometimes against the standard operator versions.
-        # The slightly different 'v1' definition prevents this in the following cases:
-        #
-        #     1. In max pooling, when multiple input values are the maximum in the input window,
-        #        the 'v1' implementation backprops the gradient to all maxima, whereas the standard
-        #        pooling operator backprops the gradient to the lowest-indexed maximum only.
-        #     2. In max pooling, the 'v1' operator pads with 0's and this value can become the
-        #        maximum output value in the case of an all-negative input.  The standard pooling
-        #        operator effectively considers the padding to be the largest negative value, so
-        #        only input values should appear in the output.
-        #     3. In avg pooling, the 'v1' operator divides the sum by the same window size factor,
-        #        even at the edges, and so does not support count_include_pad = False.
-        #     4. The float16 'v1' pooling operator performs forward sums and averages in
-        #        float16, whereas the std operators perform those calculations in float32, so
-        #        greater float16 tolerances are needed when comparing across implementations.
-
-        # Double the float16 tol when comparing v1 and non-v1 implemenations, per note 4 above.
-        relaxed_tol = {np.dtype(np.float16): 2e-1,
-               np.dtype(np.float32): 1e-3,
-               np.dtype(np.float64): 1e-5,
-               np.dtype(np.uint8): 0,
-               np.dtype(np.int32): 0,
-               np.dtype(np.int64): 0}
-
-        # Exclude std implementations due to points 1 and 2 above.
-        test_pooling_dim('2D', 'max', dtype, v1_pool_op_list)
-        # The standard and 'v1' implementations match for this case.
-        test_pooling_dim('2D', 'avg', dtype, combo_pool_op_list, count_include_pad=True,
-                         tol=relaxed_tol)
-        # Exclude std implementations due to point 3 above.
-        test_pooling_dim('2D', 'avg', dtype, v1_pool_op_list, count_include_pad=False)
-        # The standard and 'v1' implementations match for this case.
-        test_pooling_dim('2D', 'sum', dtype, combo_pool_op_list, tol=relaxed_tol)
-
-    # We can compare the standard and 'v1' max pooling implementations if we eliminate padding
-    # (see point 2 above) and use np.float64 data so that no two random input window values are
-    # likely to be the same (see point 1 above).
-    test_pooling_dim('2D_no_padding', 'max', np.float64, combo_pool_op_list)
-
 
 @with_seed()
 def test_pooling_full_2d():
@@ -1318,19 +1266,6 @@ def test_2d_pooling(pool_type, p_value=2):
 
         pooling_convention = 'valid'
 
-        if pool_type != 'lp':
-            ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
-            sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
-                                              pooling_convention=pooling_convention, global_pool=True, name='pool'))
-
-            ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
-            sym_list.append(mx.sym.Pooling_v1(kernel=kernel, pool_type=pool_type,
-                                              pooling_convention=pooling_convention, global_pool=True, name='pool'))
-
-            ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
-            sym_list.append(mx.sym.Pooling_v1(pool_type=pool_type,
-                                              pooling_convention=pooling_convention, global_pool=True, name='pool'))
-
         ctx_list.append({'ctx': mx.cpu(0), 'pool_data': data, 'type_dict': {'pool_data': np.float32}})
         sym_list.append(mx.sym.Pooling(kernel=kernel, pad=pad, stride=stride, pool_type=pool_type,
                                        pooling_convention=pooling_convention, global_pool=True, p_value=p_value, name='pool'))
diff --git a/tests/python/unittest/test_dlpack.py b/tests/python/unittest/test_dlpack.py
deleted file mode 100644
index 46bdde7d0bcd..000000000000
--- a/tests/python/unittest/test_dlpack.py
+++ /dev/null
@@ -1,45 +0,0 @@
-# Licensed to the Apache Software Foundation (ASF) under one
-# or more contributor license agreements.  See the NOTICE file
-# distributed with this work for additional information
-# regarding copyright ownership.  The ASF licenses this file
-# to you under the Apache License, Version 2.0 (the
-# "License"); you may not use this file except in compliance
-# with the License.  You may obtain a copy of the License at
-#
-#   http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing,
-# software distributed under the License is distributed on an
-# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, either express or implied.  See the License for the
-# specific language governing permissions and limitations
-# under the License.
-
-import ctypes
-import mxnet as mx
-from mxnet.base import NDArrayHandle, _LIB, c_str, check_call
-from mxnet.test_utils import assert_almost_equal
-
-def test_from_dlpack_backward_compatibility():
-    def from_dlpack_old(dlpack):
-
-        PyCapsuleDestructor = ctypes.CFUNCTYPE(None, ctypes.c_void_p)
-        _c_str_dltensor = c_str('dltensor')
-        _c_str_used_dltensor = c_str('used_dltensor')
-        handle = NDArrayHandle()
-        dlpack = ctypes.py_object(dlpack)
-        assert ctypes.pythonapi.PyCapsule_IsValid(dlpack, _c_str_dltensor), ValueError(
-            'Invalid DLPack Tensor. DLTensor capsules can be consumed only once.')
-        dlpack_handle = ctypes.c_void_p(ctypes.pythonapi.PyCapsule_GetPointer(dlpack, _c_str_dltensor))
-        check_call(_LIB.MXNDArrayFromDLPack(dlpack_handle, ctypes.byref(handle)))
-        # Rename PyCapsule (DLPack)
-        ctypes.pythonapi.PyCapsule_SetName(dlpack, _c_str_used_dltensor)
-        # delete the deleter of the old dlpack
-        ctypes.pythonapi.PyCapsule_SetDestructor(dlpack, None)
-        return mx.nd.NDArray(handle=handle)
-
-    x = mx.nd.ones((2,3))
-    y = mx.nd.to_dlpack_for_read(x)
-    z = from_dlpack_old(y)
-    assert_almost_equal(x.asnumpy(), z.asnumpy(), rtol=1e-5, atol=1e-5)
-
diff --git a/tests/python/unittest/test_gluon_data_vision.py b/tests/python/unittest/test_gluon_data_vision.py
index 320b33d3e28a..eddd77152f5a 100644
--- a/tests/python/unittest/test_gluon_data_vision.py
+++ b/tests/python/unittest/test_gluon_data_vision.py
@@ -29,6 +29,7 @@
     xfail_when_nonstandard_decimal_separator
 
 import numpy as np
+import pytest
 
 @with_seed()
 def test_to_tensor():
@@ -381,17 +382,19 @@ def test_random_rotation():
 def test_random_transforms():
     from mxnet.gluon.data.vision import transforms
 
-    tmp_t = transforms.Compose([transforms.Resize(300), transforms.RandomResizedCrop(224)])
-    transform = transforms.Compose([transforms.RandomApply(tmp_t, 0.5)])
+    # Count calls to the wrapped transform directly instead of inferring them from output shape.
+    counter = 0
+    def transform_fn(x):
+        nonlocal counter
+        counter += 1
+        return x
+    transform = transforms.Compose([transforms.RandomApply(transform_fn, 0.5)])
 
-    img = mx.nd.ones((10, 10, 3), dtype='uint8')
-    iteration = 1000
-    num_apply = 0
+    img = mx.np.ones((10, 10, 3), dtype='uint8')
+    iteration = 10000
     for _ in range(iteration):
-        out = transform(img)
-        if out.shape[0] == 224:
-            num_apply += 1
-    assert_almost_equal(num_apply/float(iteration), 0.5, 0.1)
+        transform(img)
+    assert counter == pytest.approx(0.5 * iteration, rel=1e-1)
 
 @xfail_when_nonstandard_decimal_separator
 @with_seed()
diff --git a/tests/python/unittest/test_numpy_gluon_data_vision.py b/tests/python/unittest/test_numpy_gluon_data_vision.py
index ec82052eff50..deb61de0b38a 100644
--- a/tests/python/unittest/test_numpy_gluon_data_vision.py
+++ b/tests/python/unittest/test_numpy_gluon_data_vision.py
@@ -32,6 +32,7 @@
 from mxnet.base import MXNetError
 from mxnet.gluon.data.vision import transforms
 from mxnet import image
+import pytest
 
 @with_seed()
 @use_np
@@ -341,16 +342,19 @@ def test_random_transforms():
     from mxnet.gluon.data.vision import transforms
 
-    tmp_t = transforms.Compose([transforms.Resize(300), transforms.RandomResizedCrop(224)])
-    transform = transforms.Compose([transforms.RandomApply(tmp_t, 0.5)])
+    # Count calls to the wrapped transform directly instead of inferring them
+    # from the output shape.
+    counter = 0
+    def transform_fn(x):
+        nonlocal counter
+        counter += 1
+        return x
+    transform = transforms.Compose([transforms.RandomApply(transform_fn, 0.5)])
 
     img = mx.np.ones((10, 10, 3), dtype='uint8')
-    iteration = 1000
-    num_apply = 0
+    iteration = 10000
     for _ in range(iteration):
-        out = transform(img)
-        if out.shape[0] == 224:
-            num_apply += 1
-    assert_almost_equal(num_apply/float(iteration), 0.5, 0.1)
+        transform(img)
+    assert counter == pytest.approx(0.5 * iteration, rel=1e-1)
 
 @xfail_when_nonstandard_decimal_separator
 @with_seed()