This repository has been archived by the owner on Nov 17, 2023. It is now read-only.

clean up C API #18969

Merged
merged 2 commits on Aug 23, 2020
2 changes: 1 addition & 1 deletion benchmark/opperf/utils/profiler_utils.py
@@ -117,7 +117,7 @@ def parse_profiler_dump(operator_name, profiler_dump):
MXNDArrayFree                      49     1.1220     0.0170     0.0360     0.0229
MXAutogradBackwardEx               50    11.5460     0.1980     0.3360     0.2309
MXNet C API Calls                 399     1.9990     1.6010     1.9990     0.1990
-MXImperativeInvokeEx              50     4.4810     0.0700     0.1330     0.0896
+MXImperativeInvoke                50     4.4810     0.0700     0.1330     0.0896
MXNDArrayWaitAll                   50   769.0570    14.0200    24.5030    15.3811
MXAutogradSetIsTraining           100     0.0190     0.0000     0.0010     0.0002
MXAutogradSetIsRecording          100     0.0400     0.0000     0.0010     0.0004
@@ -50,12 +50,12 @@ for MXNet users to do multi-threaded inference.
* \brief create cached operator, allows to choose thread_safe version
* of cachedop
*/
-MXNET_DLL int MXCreateCachedOpEX(SymbolHandle handle,
-                                 int num_flags,
-                                 const char** keys,
-                                 const char** vals,
-                                 CachedOpHandle *out,
-                                 bool thread_safe DEFAULT(false));
+MXNET_DLL int MXCreateCachedOp(SymbolHandle handle,
+                               int num_flags,
+                               const char** keys,
+                               const char** vals,
+                               CachedOpHandle *out,
+                               bool thread_safe DEFAULT(false));
```

## Multithreaded inference in MXNet with C API and CPP Package
@@ -135,8 +135,8 @@ The above code loads params and copies input data and params to specific context
[multi_threaded_inference.cc#L207-L233](https://github.com/apache/incubator-mxnet/example/multi_threaded_inference/multi_threaded_inference.cc#L207-L233)

The above code prepares `flag_key_cstrs` and `flag_val_cstrs` to be passed to the cached op.
-The C API call is made with `MXCreateCachedOpEX`. This creates a thread-safe cached
-op, since `thread_safe` (the last parameter to `MXCreateCachedOpEX`) is set to
+The C API call is made with `MXCreateCachedOp`. This creates a thread-safe cached
+op, since `thread_safe` (the last parameter to `MXCreateCachedOp`) is set to
true. A handle created with `thread_safe` set to false invokes CachedOp instead of CachedOpThreadSafe.
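For reference, a minimal sketch of this step, assuming a `mxnet::cpp::Symbol` named `out` has already been loaded as in the linked example (the flag keys and values shown here are illustrative, not the exact values the example computes):

```
#include <mxnet/c_api.h>

#include <string>
#include <vector>

// Illustrative cached-op flags; the real example derives these from the model.
std::vector<std::string> flag_keys{"data_indices", "param_indices"};
std::vector<std::string> flag_vals{"[0]", "[1,2]"};

std::vector<const char*> flag_key_cstrs, flag_val_cstrs;
for (size_t i = 0; i < flag_keys.size(); ++i) {
  flag_key_cstrs.emplace_back(flag_keys[i].c_str());
  flag_val_cstrs.emplace_back(flag_vals[i].c_str());
}

CachedOpHandle hdl = nullptr;
// The last argument (thread_safe) is true, so a CachedOpThreadSafe is created.
int ret1 = MXCreateCachedOp(out.GetHandle(), flag_keys.size(),
                            flag_key_cstrs.data(), flag_val_cstrs.data(),
                            &hdl, true);
if (ret1 < 0) {
  LOG(FATAL) << MXGetLastError();
}
```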


@@ -146,7 +146,7 @@

The above creates a lambda function that takes the thread number as its argument.
If `random_sleep` is set, the thread first sleeps for a random duration between 0 and 5 seconds.
-Following this, it invokes `MXInvokeCachedOpEx` (the handle determines whether the thread-safe version of the cached op is invoked).
+Following this, it invokes `MXInvokeCachedOp` (the handle determines whether the thread-safe version of the cached op is invoked).
A handle created with `thread_safe` set to false invokes CachedOp instead of CachedOpThreadSafe.
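Putting that together, a sketch of the worker lambda, assuming `hdl`, `arr_handles`, `cached_op_handles`, and `random_sleep` are set up as in the example (needs `<thread>`, `<chrono>`, and `<cstdlib>`):

```
auto run_fn = [&](int num) {
  // Optionally sleep 0-5 seconds to shuffle the thread execution order.
  if (random_sleep) {
    std::this_thread::sleep_for(std::chrono::seconds(std::rand() % 6));
  }
  int num_output = 0;
  const int* stypes = nullptr;
  // The handle decides whether the thread-safe cached op variant runs.
  int ret = MXInvokeCachedOp(hdl, arr_handles[num].size(), arr_handles[num].data(),
                             cpu::kDevMask, 0, &num_output,
                             &(cached_op_handles[num]), &stypes);
  if (ret < 0) {
    LOG(FATAL) << MXGetLastError();
  }
};
```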

### Step 5: Spawn multiple threads and wait for all threads to complete
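The collapsed code here boils down to the usual spawn-and-join pattern; a sketch, assuming the `run_fn` lambda from step 4 and a thread count `num_threads`:

```
std::vector<std::thread> worker_threads;
worker_threads.reserve(num_threads);
for (int i = 0; i < num_threads; ++i) {
  worker_threads.emplace_back(run_fn, i);  // each thread runs one inference
}
for (auto& t : worker_threads) {
  t.join();  // wait for all threads to complete
}
```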
@@ -179,7 +179,7 @@ The above code outputs results for different threads and cleans up the thread sa
6. Bulking of ops is not supported.
7. Only inference use cases are currently supported; training use cases are not.
8. Graph rewrites with the subgraph API are currently not supported.
-9. There is currently no frontend API support to run multi-threaded inference. Users can use CreateCachedOpEX and InvokeCachedOp in combination with
+9. There is currently no frontend API support to run multi-threaded inference. Users can use CreateCachedOp and InvokeCachedOp in combination with
the CPP frontend to run multi-threaded inference as of today.
10. Multi-threaded inference with the threaded engine is not currently supported with the Module/Symbolic API or the C Predict API.
11. Exceptions thrown by `wait_to_read` in individual threads can cause issues. Calling invoke from each thread and calling WaitAll after the threads join should still work fine.
18 changes: 9 additions & 9 deletions docs/static_site/src/pages/api/developer_guide/profiling.md
@@ -130,11 +130,11 @@ MXNET_C_API
=================
Name                      Total Count    Time (ms)    Min Time (ms)    Max Time (ms)    Avg Time (ms)
----                      -----------    ---------    -------------    -------------    -------------
-MXImperativeInvokeEx               2       0.3360           0.0990           0.2370          0.1680
+MXImperativeInvoke                 2       0.3360           0.0990           0.2370          0.1680
MXNet C API Calls                  17       0.2320           0.2160           0.2320          0.0080
MXNDArraySyncCopyFromCPU            1       0.1750           0.1750           0.1750          0.1750
-MXNDArrayCreateEx                   1       0.1050           0.1050           0.1050          0.1050
-MXNDArrayGetShapeEx                11       0.0210           0.0000           0.0160          0.0019
+MXNDArrayCreate                     1       0.1050           0.1050           0.1050          0.1050
+MXNDArrayGetShape                  11       0.0210           0.0000           0.0160          0.0019
MXNDArrayWaitAll                    1       0.0200           0.0200           0.0200          0.0200
MXNDArrayGetDType                   1       0.0010           0.0010           0.0010          0.0010
MXNet C API Concurrency            34       0.0000           0.0000           0.0010          0.0000
@@ -157,8 +157,8 @@ The profiling data has captured info about interesting functions that have execu

|**Function Name** |**Description** |
|--- |--- |
-|**MXImperativeInvokeEx** | invokes an operator to perform the computation |
-|**MXNDArrayCreateEx** | creates an ndarray |
+|**MXImperativeInvoke** | invokes an operator to perform the computation |
+|**MXNDArrayCreate** | creates an ndarray |
| **MXNDArrayGetDType** | returns the data type of the ndarray |
| **MXNDArrayGetShape** | returns the shape of the ndarray (as a tuple where each element is the size of a dimension) |
| **MXNDArraySyncCopyFromCPU** | called when data initially resides outside of an MXNet data structure (i.e. numpy.ndarray rather than mxnet.numpy.ndarray). Data is copied into the MXNet data structure |
@@ -201,7 +201,7 @@ In the following list, #1 uses regular numpy functions to initialize data. MXNet
![dev_guide_profilling_3.png](/assets/img/dev_guide_profilling_3.png)
Here, the four red arrows show the important events in this sequence.

-1. First, `MXNDArrayCreateEx` is called to physically allocate space to store the data and other necessary attributes in the `ndarray` class.
+1. First, `MXNDArrayCreate` is called to physically allocate space to store the data and other necessary attributes in the `ndarray` class.
2. Then some support functions are called (`MXNDArrayGetShape`, `MXNDArrayGetDType`) while initializing the data structure.
3. Finally, the data is copied from the non-MXNet ndarray into the newly prepared MXNet ndarray by the `MXNDArraySyncCopyFromCPU` function.

@@ -210,9 +210,9 @@ Next, #3 (in our code example) begins the computing process to produce our outpu
![dev_guide_profilling_4.png](/assets/img/dev_guide_profilling_4.png)
Here you can see that the following sequence of events happens:

-1. `MXImperativeInvokeEx` is called the first time to launch the diagonal operator from #3 (in our code example).
+1. `MXImperativeInvoke` is called the first time to launch the diagonal operator from #3 (in our code example).
2. Soon after that the actual **`diag`** operator begins executing in another thread.
-3. While that is happening, our main thread moves on and calls `MXImperativeInvokeEx` again to launch the **`sum`** operator. Just like before, this returns without actually executing the operator and continues.
+3. While that is happening, our main thread moves on and calls `MXImperativeInvoke` again to launch the **`sum`** operator. Just like before, this returns without actually executing the operator and continues.
4. Lastly, `MXNDArrayWaitAll` is called as the main thread has progressed to #4 in our app. It will wait here while all the computation finishes.

Next, let's look at the part of the timeline zoomed in to the actual operator execution.
@@ -274,6 +274,6 @@ The first red box is the first run, and the 2nd smaller one is the 2nd run. Firs


![dev_guide_profilling_7.png](/assets/img/dev_guide_profilling_7.png)
-We still have the same sequence of events at the beginning to initialize the MXNet ndarray (`MXNDArrayCreateEx`, `MXNDArrayGetShape`, `MXNDArrayGetDType`, `MXNDArraySyncCopyFromCPU`). Then the **`diag`** operator runs, followed by the **`sum`** operator, and finally the `waitall`. When you look at this, be careful about the assumptions that you make. In this version of the timeline, it appears that the operator executes after the `MXImperativeInvokeEx` runs, and seems to imply an inherent ordering. But realize that there is no dependency between the **`diag`** operator finishing and the next **`MXImperativeInvokeEx`** launching the **`sum`** operator. In this case, it just so happens that the **`diag`** operator finishes so quickly that it appears that way. But in reality the main thread is launching the operators and not waiting for them to finish. Lastly, keep in mind that in this case by the time we hit the **`MXNDArrayWaitAll`** everything is already done and we return immediately, but in other circumstances it may sit here waiting for everything to finish (like we saw earlier in the first run).
+We still have the same sequence of events at the beginning to initialize the MXNet ndarray (`MXNDArrayCreate`, `MXNDArrayGetShape`, `MXNDArrayGetDType`, `MXNDArraySyncCopyFromCPU`). Then the **`diag`** operator runs, followed by the **`sum`** operator, and finally the `waitall`. When you look at this, be careful about the assumptions that you make. In this version of the timeline, it appears that the operator executes after the `MXImperativeInvoke` runs, and seems to imply an inherent ordering. But realize that there is no dependency between the **`diag`** operator finishing and the next **`MXImperativeInvoke`** launching the **`sum`** operator. In this case, it just so happens that the **`diag`** operator finishes so quickly that it appears that way. But in reality the main thread is launching the operators and not waiting for them to finish. Lastly, keep in mind that in this case by the time we hit the **`MXNDArrayWaitAll`** everything is already done and we return immediately, but in other circumstances it may sit here waiting for everything to finish (like we saw earlier in the first run).


10 changes: 5 additions & 5 deletions example/multi_threaded_inference/multi_threaded_inference.cc
@@ -226,9 +226,9 @@ void run_inference(const std::string& model_name, const std::vector<mxnet::cpp::
flag_val_cstrs.emplace_back(flag_vals[i].c_str());
}

-  int ret1 = MXCreateCachedOpEX(out.GetHandle(), flag_keys.size(),
-                                flag_key_cstrs.data(), flag_val_cstrs.data(),
-                                &hdl, true);
+  int ret1 = MXCreateCachedOp(out.GetHandle(), flag_keys.size(),
+                              flag_key_cstrs.data(), flag_val_cstrs.data(),
+                              &hdl, true);
if (ret1 < 0) {
LOG(FATAL) << MXGetLastError();
}
@@ -256,8 +256,8 @@ void run_inference(const std::string& model_name, const std::vector<mxnet::cpp::
}
int num_output = 0;
const int *stypes;
-    int ret = MXInvokeCachedOpEx(hdl, arr_handles[num].size(), arr_handles[num].data(),
-                                 cpu::kDevMask, 0, &num_output, &(cached_op_handles[num]), &stypes);
+    int ret = MXInvokeCachedOp(hdl, arr_handles[num].size(), arr_handles[num].data(),
+                               cpu::kDevMask, 0, &num_output, &(cached_op_handles[num]), &stypes);
if (ret < 0) {
LOG(FATAL) << MXGetLastError();
}