From 7358f5f491cf75a48cf73574d5642916fb0c49c8 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Thu, 13 Feb 2020 07:49:35 +0000 Subject: [PATCH 01/19] initial commit --- example/extensions/lib_subgraph/README.md | 120 ++++++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 example/extensions/lib_subgraph/README.md diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md new file mode 100644 index 000000000000..bebeb68b0f85 --- /dev/null +++ b/example/extensions/lib_subgraph/README.md @@ -0,0 +1,120 @@ + + + + + + + + + + + + + + + + + +Custom Partitioner Example and Tutorial +============================= + +## Introduction + +Adding custom model partitioners in MXNet used to require understanding of MXNet backend operator registration and recompiling of MXNet with all its dependencies. So our approach for adding custom partitioners is to enable dynamic loading of C++ partitioning code compiled in external libraries at runtime. + +Custom partitioners enable users to write custom model partitioning strategies without compiling against all of MXNet header files and dependencies. When a library containing custom partitioners is loaded dynamically, the components found in the library will be re-registered in MXNet so that users can use those natively just like other built-in components. + +## Getting Started + +### Have MXNet Ready + +First you should install MXNet either from compiling from source code or download from nightly build. It doesn’t matter if the build comes with CUDA or MKLDNN. The custom partitioning APIs do not interact with the execution of other native MXNet operators. + +### Run An Example: + +You can start getting familiar with custom partitioners by running an example provided in the **example/extensions/lib_subgraph** directory. This example partitions `exp` and `log` operators into subgraphs. Go to `lib_subgraph` directory and follow these steps: + +1. Run `make`. The Makefile will generate a dynamic library ** libsubgraph_lib.so** compiled from `subgraph_lib.cc`. This is the library you are going to load that contains everything for the custom partitioner. +2. Run `python test_subgraph.py`. It’ll first load the above .so library, find the components, register them in the MXNet backend, print "Found x", then partition the model and execute the operators like a regular MXNet operator and output the result. + +### Basic Files For Gemm Library: + +* ** lib_subgraph/subgraph_lib.cc**: This file has a source code implementation of all required components of a custom partitioner, as well as the registration of the custom components. + +* ** lib_subgraph/Makefile**: Compile source code to a dynamic shared library, with a header file `include/mxnet/lib_api.h` from MXNet source code. Currently the custom operator is compatible with C++11 onwards. + +* ** lib_subgraph/test_subgraph.py**: This file calls `mx.library.load(‘libsubgraph_lib.so’)` to load the library containing the custom components, partitions the model using the `optimize_for` API, and prints outputs of the forward passes. The outputs should be the same as the regular MXNet forward pass without partitioning. 
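For orientation, the whole flow in `test_subgraph.py` boils down to the condensed sketch below. This is illustrative only, not the file verbatim: the symbol, shapes, and variable names are simplified, and `myProp` is the partitioner name this example library registers.

```python
import mxnet as mx

# load the compiled partitioner library
mx.library.load('libsubgraph_lib.so')

# build a small graph containing the ops this partitioner looks for (exp, log)
a = mx.sym.var('a')
b = mx.sym.var('b')
sym = mx.sym.log(mx.sym.exp(a + b))

# baseline: run the unpartitioned graph
args = {'a': mx.nd.ones((3, 2)), 'b': mx.nd.ones((3, 2))}
exe = sym.bind(ctx=mx.cpu(), args=args)
out = exe.forward()

# partition with the custom strategy registered by the library, then run again
part_sym = sym.optimize_for('myProp', args)
part_exe = part_sym.bind(ctx=mx.cpu(), args=args)
part_out = part_exe.forward()

# the partitioned graph should produce the same result as the baseline
print(out[0])
print(part_out[0])
```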
+ +## Writing Custom Partitioner Library: + +For building a library containing your own custom partitioner, compose a C++ source file like `mypart_lib.cc`, include `lib_api.h` header file, and write your custom operator implementation with these essential functions: +- `initialize` - Library Initialization Function +- `REGISTER_PARTITIONER ` - Partitioner Registration Macro +- `mySupportedOps ` - Operator Support + +Then compile it to `mypart_lib.so` dynamic library using the following command: +```bash +g++ -shared -fPIC -std=c++11 mypart_lib.cc -o libmypart_lib.so -I ../../../include/mxnet +``` + +Finally, you can write a Python script to load the library and partition a model with your custom partitioner: +```python +import mxnet as mx +mx.library.load(‘libmyop_lib.so’) +sym, _, _ = mx.model.load_checkpoint('mymodel', 0) +sym.optimize_for("myPart") +``` + +### Writing A Custom Partitioner: + +There are several essential building blocks for making a custom partitioner: + +* [initialize](./subgraph_lib.cc#L242): + * This function is the library initialization function necessary for any dynamic libraries. It lets you check if the user is using a compatible version of MXNet. Note that this `version` parameter is passed from MXNet when library is loaded. + + MXReturnValue initialize(int version) + +* [supportedOps](./subgraph_lib.cc#L179): + * This function provides a copy of the model graph as a JSON string, and provides an interface for identifying which operators should be partitioned into a subgraph. Also this is where a custom partitioner can validate the options specified by the user. + + MXReturnValue supportedOps( + std::string json, + const int num_ids, + int *ids, + std::unordered_map& options) + +* [REGISTER_PARTITIONER(my_part_name)](./subgraph_lib.cc#L238): + * This macro registers the custom partitioner and its properties to MXNet by its name. Notice that a partitioner can have multiple partitioning strategies. This enables multiple *passes* to be run in a single partitioning call from the user. The first argument to `addStrategy` is a user-specified name. The second argument is the `supportedOps` function. The third argument is the name of the subgraph operator to create for each subgraph created during partitioning (see below for more info about subgraph operators). The `setAcceptSubgraph` API registers a callback function that is called for each subgraph created during partitioning (more on this below). Notice that the first argument to this function is the strategy to associate with and the second argument is the `acceptSubgraph`. + + REGISTER_PARTITIONER(my_part_name) + .addStrategy("strategy1", + supportedOps, + "_custom_subgraph_op") + .setAcceptSubgraph("strategy1", + acceptSubgraph); + + +Also there are some optional functions you can specify: + +* [acceptSubgraph](./subgraph_lib.cc#L220): + * This function provides an opportunity to accept/reject a subgraph after MXNet partitions it. It also allows specifying custom attributes on the subgraph (ie. user-generated IDs). If you do not register this function, subgraphs will be accepted by default. Only implement this function to set attributes or conditionally reject subgraphs. + + MXReturnValue acceptSubgraph( + std::string json, + int subraph_id, + bool* accept, + std::unordered_map& options, + std::unordered_map& attrs) + +Let’s take a closer look at those registry functions: + +* **supportedOps**: This function takes four arguments. 
The 1st argument is a JSON string of the model architecture graph, where nodes are inputs/params/weights and edges are data dependencies. The graph is pre-sorted in topological order. When traversing the graph, operators to be partitioned into subgraphs are identified and an entry is set to `1` for the node ID in the `ids` array. Users can pass custom options to the partitioner and they are passed to the function in the `options` map. + +* **acceptSubgraph**: This function takes five arguments. The 1st argument is a JSON string of the newly partitioned subgraph. It can be analyzed and accepted/rejected by setting `true`/`false` for the `accept` input. The `options` map is the same one passed to the `supportedOps` API. The `attrs` map provides an API to add user-specified attributes to the subgraph. These attributes will be available at runtime when the subgraph is executed and provides a way to pass info from partitioning-time to runtime. + +### Writing A Custom Subgraph Operator: + +A partitioning strategy specifies how to partition a model and isolate operators into subgraphs. In MXNet, subgraphs are just a [stateful operator](../lib_custom_op#writing-stateful-custom-operator). Subgraph operators have an extra attribute called `SUBGRAPH_SYM_JSON` that maps to a JSON string of the subgraph. The expectation is that when a subgraph operator executes a forward/backward call, it executes all of the operators in the subgraph. From 08bddfcff928a85d4573ea7c5182c76543f23c5c Mon Sep 17 00:00:00 2001 From: samskalicky Date: Thu, 13 Feb 2020 07:55:21 +0000 Subject: [PATCH 02/19] fixed title header, removed GeMM reference --- example/extensions/lib_subgraph/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index bebeb68b0f85..249e512f2bf6 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -16,7 +16,7 @@ Custom Partitioner Example and Tutorial -============================= +======================================= ## Introduction @@ -37,7 +37,7 @@ You can start getting familiar with custom partitioners by running an example pr 1. Run `make`. The Makefile will generate a dynamic library ** libsubgraph_lib.so** compiled from `subgraph_lib.cc`. This is the library you are going to load that contains everything for the custom partitioner. 2. Run `python test_subgraph.py`. It’ll first load the above .so library, find the components, register them in the MXNet backend, print "Found x", then partition the model and execute the operators like a regular MXNet operator and output the result. -### Basic Files For Gemm Library: +### Basic Files For Custom Partitioner Library: * ** lib_subgraph/subgraph_lib.cc**: This file has a source code implementation of all required components of a custom partitioner, as well as the registration of the custom components. 
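To make the `acceptSubgraph` callback described above more concrete, here is a minimal sketch of an implementation. It is only an illustration: the `reject` option key is a hypothetical user option (anything the user passes ends up in `options`), while `attrs["myKey"] = "myVal"` mirrors what the example library does. Both maps are `std::unordered_map<std::string, std::string>`.

```c++
#include <iostream>
#include <string>
#include <unordered_map>
#include "lib_api.h"

// Sketch of an acceptSubgraph callback: reject the subgraph if the user asked
// to (hypothetical "reject" option), otherwise accept it and tag it with a
// custom attribute that is visible on the subgraph operator at runtime.
MXReturnValue myAcceptSubgraph(std::string json, int subgraph_id, bool* accept,
                               std::unordered_map<std::string, std::string>& options,
                               std::unordered_map<std::string, std::string>& attrs) {
  if (options.count("reject") && options["reject"] == "True") {
    *accept = false;
    std::cout << "rejecting subgraph " << subgraph_id << std::endl;
  } else {
    *accept = true;
    attrs["myKey"] = "myVal";
    std::cout << "accepting subgraph " << subgraph_id << std::endl;
  }
  return MX_SUCCESS;
}
```

Such a callback is then attached to a strategy with `setAcceptSubgraph("strategy1", myAcceptSubgraph)`, as shown in the registration macro above.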
From 324df69cc684b8f87b164d81b164573d7d7cc802 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Sun, 16 Feb 2020 09:23:09 +0000 Subject: [PATCH 03/19] added example for gluon partitioning, fixed subgraphOp gradient --- .../extensions/lib_subgraph/subgraph_lib.cc | 1 + .../extensions/lib_subgraph/test_subgraph.py | 14 ++++ src/c_api/c_api.cc | 69 ++++++++++++++----- 3 files changed, 66 insertions(+), 18 deletions(-) diff --git a/example/extensions/lib_subgraph/subgraph_lib.cc b/example/extensions/lib_subgraph/subgraph_lib.cc index 0727eb786ad8..8f339159fbc5 100644 --- a/example/extensions/lib_subgraph/subgraph_lib.cc +++ b/example/extensions/lib_subgraph/subgraph_lib.cc @@ -232,6 +232,7 @@ MXReturnValue myAcceptSubgraph(std::string json, int subraph_id, bool* accept, std::cout << "accepting subgraph" << std::endl; attrs["myKey"] = "myVal"; } + std::cout << json << std::endl; return MX_SUCCESS; } diff --git a/example/extensions/lib_subgraph/test_subgraph.py b/example/extensions/lib_subgraph/test_subgraph.py index 8169261d4d42..1bcecae3e21b 100644 --- a/example/extensions/lib_subgraph/test_subgraph.py +++ b/example/extensions/lib_subgraph/test_subgraph.py @@ -74,3 +74,17 @@ exe3 = mysym3.bind(ctx=mx.cpu(), args={'a':mx.nd.ones((3,2)), 'b':mx.nd.ones((3,2))}) out3 = exe3.forward() print(out3) + +from mxnet.gluon import nn +from mxnet import nd + +# Gluon Hybridize partitioning with shapes/types +print('-------------------------------') +print('Testing Gluon Hybridize partitioning with shapes/types') +inputs = [a,b] +sym_block = nn.SymbolBlock(sym, inputs) +sym_block.initialize() +sym_block.hybridize(backend='myProp') +out4 = sym_block(mx.nd.ones((3,2)),mx.nd.ones((3,2))) +print(out4) + diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 962bb3b6c06e..1c86e79d1122 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -346,8 +346,10 @@ int MXLoadLib(const char *path) { } else { CHECK(createop_map.size() != 0) << "Error loading '" << name << "' custom subgraph op, CreateOpState function was not set."; - } + } LOG(INFO) << "\tOp[" << i << "] " << name; + if(isSubgraphOp) + LOG(INFO) << "\t\tisSubgraphOp"; std::string name_str(name); /* @@ -581,19 +583,37 @@ int MXLoadLib(const char *path) { // FGradient register lambda auto grad_reg = [=](const nnvm::ObjectPtr& n, const std::vector& ograds) { - // copy gradients first - std::vector heads(ograds.begin(), ograds.end()); - // copy inputs second - for (auto& h : n->inputs) { - heads.push_back(h); - } - // copy outputs last - uint32_t n_out = n->num_outputs(); - for (uint32_t i = 0; i < n_out; ++i) { - heads.emplace_back(n, i, 0); - } - std::string grad_name = "_backward_" + name_str; - return mxnet::op::MakeGradNode(grad_name.c_str(), n, heads, n->attrs.dict); + auto p = nnvm::Node::Create(); + std::string grad_name = "_backward_" + name_str; + p->attrs.op = nnvm::Op::Get(grad_name.c_str()); + p->attrs.name = n->attrs.name + "_backward"; + p->attrs.dict = n->attrs.dict; + + for(auto s : n->attrs.subgraphs) + p->attrs.subgraphs.push_back(s); + + p->control_deps.emplace_back(n); + if (p->op()->attr_parser != nullptr) { + p->op()->attr_parser(&(p->attrs)); + } + // copy gradients first + std::vector heads(ograds.begin(), ograds.end()); + // copy inputs second + for (auto& h : n->inputs) { + heads.push_back(h); + } + // copy outputs last + uint32_t n_out = n->num_outputs(); + for (uint32_t i = 0; i < n_out; ++i) { + heads.emplace_back(n, i, 0); + } + p->inputs = heads; + CHECK_EQ(p->num_inputs(), p->inputs.size()) + << "Number of inputs to 
operator " << grad_name << " (" << p->num_inputs() + << ") does not match the actual number of inputs provided to operator " + << p->attrs.name << " (" << p->inputs.size() << ")."; + + return mxnet::op::CreateNodeEntries(p); }; auto resc_req = [=](const NodeAttrs& attrs) { @@ -726,15 +746,28 @@ int MXLoadLib(const char *path) { } // optionally add fgradient if user specified a function if (backward_ctx_map.size() != 0 || createop_map.size() != 0) { - regOp.set_attr("FGradient", grad_reg, plevel); std::string grad_name = "_backward_" + name_str; nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); + regOp.set_attr("FGradient", grad_reg, plevel); gradOp.set_attr("TIsBackward", true, plevel); - gradOp.set_attr_parser(attr_parser); - gradOp.set_num_inputs(num_inouts); - gradOp.set_num_outputs(num_inputs); gradOp.set_attr("FInferStorageType", infer_storage_type, plevel); gradOp.set_attr("FResourceRequest", resc_req, plevel); + + if(!isSubgraphOp) { + gradOp.set_attr_parser(attr_parser); + gradOp.set_num_inputs(num_inouts); + gradOp.set_num_outputs(num_inputs); + } else { + using namespace mxnet::op; + auto grad_inouts = [=](const nnvm::NodeAttrs& attrs) { + uint32_t cnt = DefaultSubgraphOpNumInputs(attrs); + cnt += 2 * DefaultSubgraphOpNumOutputs(attrs); + return cnt; + }; + gradOp.set_num_inputs(grad_inouts); + gradOp.set_num_outputs(DefaultSubgraphOpNumInputs); + } + if (createop_map.size() != 0) { gradOp.set_attr("TIsLayerOpBackward", true, plevel); auto fstate_backward = [=](const OpStatePtr& state_ptr, From eaf80d0dbadd09c9f48133f8efacf74b8e1f7905 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Sun, 16 Feb 2020 09:36:14 +0000 Subject: [PATCH 04/19] added comments --- src/c_api/c_api.cc | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 1c86e79d1122..b51d9959962a 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -346,7 +346,7 @@ int MXLoadLib(const char *path) { } else { CHECK(createop_map.size() != 0) << "Error loading '" << name << "' custom subgraph op, CreateOpState function was not set."; - } + } LOG(INFO) << "\tOp[" << i << "] " << name; if(isSubgraphOp) LOG(INFO) << "\t\tisSubgraphOp"; @@ -583,36 +583,38 @@ int MXLoadLib(const char *path) { // FGradient register lambda auto grad_reg = [=](const nnvm::ObjectPtr& n, const std::vector& ograds) { + // create node for gradient auto p = nnvm::Node::Create(); std::string grad_name = "_backward_" + name_str; p->attrs.op = nnvm::Op::Get(grad_name.c_str()); p->attrs.name = n->attrs.name + "_backward"; + // copy attributes and subgraphs p->attrs.dict = n->attrs.dict; - for(auto s : n->attrs.subgraphs) p->attrs.subgraphs.push_back(s); - + // set control dependency and attr parser p->control_deps.emplace_back(n); if (p->op()->attr_parser != nullptr) { p->op()->attr_parser(&(p->attrs)); } - // copy gradients first + // gradient inputs: copy gradients first std::vector heads(ograds.begin(), ograds.end()); // copy inputs second for (auto& h : n->inputs) { heads.push_back(h); } - // copy outputs last + // gradient inputs: copy outputs last uint32_t n_out = n->num_outputs(); for (uint32_t i = 0; i < n_out; ++i) { heads.emplace_back(n, i, 0); } + // set inputs to gradient node p->inputs = heads; CHECK_EQ(p->num_inputs(), p->inputs.size()) << "Number of inputs to operator " << grad_name << " (" << p->num_inputs() << ") does not match the actual number of inputs provided to operator " << p->attrs.name << " (" << p->inputs.size() << ")."; - + 
// create output node entries return mxnet::op::CreateNodeEntries(p); }; @@ -744,7 +746,7 @@ int MXLoadLib(const char *path) { regOp.set_attr("FComputeEx", forward_gpu_lambda, plevel); } } - // optionally add fgradient if user specified a function + // optionally add fgradient if user specified a function, or for stateful ops if (backward_ctx_map.size() != 0 || createop_map.size() != 0) { std::string grad_name = "_backward_" + name_str; nnvm::Op &gradOp = dmlc::Registry::Get()->__REGISTER_OR_GET__(grad_name); @@ -754,10 +756,12 @@ int MXLoadLib(const char *path) { gradOp.set_attr("FResourceRequest", resc_req, plevel); if(!isSubgraphOp) { + // register attr parser and standard functions for non-subgraph ops gradOp.set_attr_parser(attr_parser); gradOp.set_num_inputs(num_inouts); gradOp.set_num_outputs(num_inputs); } else { + // for subgraph ops use special functions using namespace mxnet::op; auto grad_inouts = [=](const nnvm::NodeAttrs& attrs) { uint32_t cnt = DefaultSubgraphOpNumInputs(attrs); From 38b4ee72b1330747efea6300175f8f2199b4ddcf Mon Sep 17 00:00:00 2001 From: samskalicky Date: Sun, 16 Feb 2020 20:07:24 +0000 Subject: [PATCH 05/19] cleaned up code, added more explanations to doc --- example/extensions/lib_subgraph/README.md | 78 +++++++++++++++++------ src/c_api/c_api.cc | 9 ++- 2 files changed, 64 insertions(+), 23 deletions(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index 249e512f2bf6..5b7b39a0bdf3 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -20,7 +20,7 @@ Custom Partitioner Example and Tutorial ## Introduction -Adding custom model partitioners in MXNet used to require understanding of MXNet backend operator registration and recompiling of MXNet with all its dependencies. So our approach for adding custom partitioners is to enable dynamic loading of C++ partitioning code compiled in external libraries at runtime. +Adding custom model partitioners in MXNet used to require deep understanding of the MXNet backend, including operator registration and, followed by recompiling MXNet from source with all of its dependencies. This feature allows adding custom partitioners by dynamically loading custom C++ partitioners compiled in external libraries at runtime. Custom partitioners enable users to write custom model partitioning strategies without compiling against all of MXNet header files and dependencies. When a library containing custom partitioners is loaded dynamically, the components found in the library will be re-registered in MXNet so that users can use those natively just like other built-in components. @@ -28,31 +28,31 @@ Custom partitioners enable users to write custom model partitioning strategies w ### Have MXNet Ready -First you should install MXNet either from compiling from source code or download from nightly build. It doesn’t matter if the build comes with CUDA or MKLDNN. The custom partitioning APIs do not interact with the execution of other native MXNet operators. +First you should install MXNet either from compiling from source code or downloading a nightly build. It doesn’t matter if the build comes with CUDA or MKLDNN. The custom partitioning APIs do not interact with the execution of other native MXNet operators. -### Run An Example: +### Run An Example -You can start getting familiar with custom partitioners by running an example provided in the **example/extensions/lib_subgraph** directory. 
This example partitions `exp` and `log` operators into subgraphs. Go to `lib_subgraph` directory and follow these steps: +You can start getting familiar with custom partitioners by running an example provided in the **example/extensions/lib_subgraph** directory. This example partitions `exp` and `log` operators into subgraphs. Go to the `lib_subgraph` directory and follow these steps: -1. Run `make`. The Makefile will generate a dynamic library ** libsubgraph_lib.so** compiled from `subgraph_lib.cc`. This is the library you are going to load that contains everything for the custom partitioner. -2. Run `python test_subgraph.py`. It’ll first load the above .so library, find the components, register them in the MXNet backend, print "Found x", then partition the model and execute the operators like a regular MXNet operator and output the result. +1. Run `make`. The Makefile will generate a dynamic library **libsubgraph_lib.so** compiled from `subgraph_lib.cc`. This is the library you are going to load that contains everything for the custom partitioner. +2. Run `python test_subgraph.py`. It’ll first load the above library, find the components, register them in the MXNet backend, print "Found x", then partition the model and execute the operators like a regular MXNet operator and output the result. -### Basic Files For Custom Partitioner Library: +### Basic Files For Custom Partitioner Library -* ** lib_subgraph/subgraph_lib.cc**: This file has a source code implementation of all required components of a custom partitioner, as well as the registration of the custom components. +* **lib_subgraph/subgraph_lib.cc**: This file has a source code implementation of all required components to make a custom partitioner, it also shows registration of them so that they can be loaded by MXNet. -* ** lib_subgraph/Makefile**: Compile source code to a dynamic shared library, with a header file `include/mxnet/lib_api.h` from MXNet source code. Currently the custom operator is compatible with C++11 onwards. +* **lib_subgraph/Makefile**: This file compiles the source code to a dynamic shared library, with a header file `include/mxnet/lib_api.h` from MXNet source code. Currently the custom operator is compatible with C++11 onwards. -* ** lib_subgraph/test_subgraph.py**: This file calls `mx.library.load(‘libsubgraph_lib.so’)` to load the library containing the custom components, partitions the model using the `optimize_for` API, and prints outputs of the forward passes. The outputs should be the same as the regular MXNet forward pass without partitioning. +* **lib_subgraph/test_subgraph.py**: This file calls `mx.library.load(‘libsubgraph_lib.so’)` to load the library containing the custom components, partitions the model using the `optimize_for` API, and prints outputs of the forward passes. The outputs should be the same as the regular MXNet forward pass without partitioning. 
-## Writing Custom Partitioner Library: +## Writing Custom Partitioner Library -For building a library containing your own custom partitioner, compose a C++ source file like `mypart_lib.cc`, include `lib_api.h` header file, and write your custom operator implementation with these essential functions: +For building a library containing your own custom partitioner, compose a C++ source file like `mypart_lib.cc`, include `lib_api.h` header file, and write your custom partitioner with these essential functions: - `initialize` - Library Initialization Function - `REGISTER_PARTITIONER ` - Partitioner Registration Macro - `mySupportedOps ` - Operator Support -Then compile it to `mypart_lib.so` dynamic library using the following command: +Then compile it to the `mypart_lib.so` dynamic library using the following command: ```bash g++ -shared -fPIC -std=c++11 mypart_lib.cc -o libmypart_lib.so -I ../../../include/mxnet ``` @@ -62,10 +62,16 @@ Finally, you can write a Python script to load the library and partition a model import mxnet as mx mx.library.load(‘libmyop_lib.so’) sym, _, _ = mx.model.load_checkpoint('mymodel', 0) -sym.optimize_for("myPart") + +# Symbol/Module flow +sym2 = sym.optimize_for("myPart") + +# Gluon flow +sym_block = nn.SymbolBlock(sym, inputs) +sym_block.hybridize(backend='myPart') ``` -### Writing A Custom Partitioner: +### Writing A Custom Partitioner There are several essential building blocks for making a custom partitioner: @@ -85,7 +91,7 @@ There are several essential building blocks for making a custom partitioner: std::string>& options) * [REGISTER_PARTITIONER(my_part_name)](./subgraph_lib.cc#L238): - * This macro registers the custom partitioner and its properties to MXNet by its name. Notice that a partitioner can have multiple partitioning strategies. This enables multiple *passes* to be run in a single partitioning call from the user. The first argument to `addStrategy` is a user-specified name. The second argument is the `supportedOps` function. The third argument is the name of the subgraph operator to create for each subgraph created during partitioning (see below for more info about subgraph operators). The `setAcceptSubgraph` API registers a callback function that is called for each subgraph created during partitioning (more on this below). Notice that the first argument to this function is the strategy to associate with and the second argument is the `acceptSubgraph`. + * This macro registers the custom partitioner and its properties to MXNet by its name. Notice that a partitioner can have multiple partitioning strategies. This enables multiple *passes* to be run in a single partitioning call from the user. The first argument to `addStrategy` is a user-specified name. The second argument is the `supportedOps` function. The third argument is the name of the subgraph operator to create for each subgraph created during partitioning (see below for more info about subgraph operators). The `setAcceptSubgraph` API registers a callback function that is called for each subgraph created during partitioning (more on this below). Notice that the first argument to this function is the strategy to associate with and the second argument is the `acceptSubgraph` function. 
REGISTER_PARTITIONER(my_part_name) .addStrategy("strategy1", @@ -98,7 +104,7 @@ There are several essential building blocks for making a custom partitioner: Also there are some optional functions you can specify: * [acceptSubgraph](./subgraph_lib.cc#L220): - * This function provides an opportunity to accept/reject a subgraph after MXNet partitions it. It also allows specifying custom attributes on the subgraph (ie. user-generated IDs). If you do not register this function, subgraphs will be accepted by default. Only implement this function to set attributes or conditionally reject subgraphs. + * This function provides an opportunity to accept/reject a subgraph after MXNet partitions it. It also allows specifying custom attributes on the subgraph (ie. user-generated IDs). If you do not register this function, subgraphs will be accepted by default. MXReturnValue acceptSubgraph( std::string json, @@ -115,6 +121,42 @@ Let’s take a closer look at those registry functions: * **acceptSubgraph**: This function takes five arguments. The 1st argument is a JSON string of the newly partitioned subgraph. It can be analyzed and accepted/rejected by setting `true`/`false` for the `accept` input. The `options` map is the same one passed to the `supportedOps` API. The `attrs` map provides an API to add user-specified attributes to the subgraph. These attributes will be available at runtime when the subgraph is executed and provides a way to pass info from partitioning-time to runtime. -### Writing A Custom Subgraph Operator: +### Writing A Custom Subgraph Operator A partitioning strategy specifies how to partition a model and isolate operators into subgraphs. In MXNet, subgraphs are just a [stateful operator](../lib_custom_op#writing-stateful-custom-operator). Subgraph operators have an extra attribute called `SUBGRAPH_SYM_JSON` that maps to a JSON string of the subgraph. The expectation is that when a subgraph operator executes a forward/backward call, it executes all of the operators in the subgraph. + +When registering a custom subgraph operator, all thats needed is to register a `createOpState` function and to set that the operator is a subgraph operator by calling the `setIsSubgraphOp` API like: +``` + REGISTER_OP(my_subgraph_op) + .setIsSubgraphOp() + .setCreateOpState(createOpState, "cpu"); + +### Parsing a JSON string + +To simplify custom partitioner libraries, basic JSON parsing utility functions have been implemented in the `lib_api.h` header file. You create a `JsonParser` object and parse the string by calling the `parse_to_json` API like: +```c++ +JsonParser parser; +JsonVal json_val = parser.parse_to_json(json_string); +``` + +A `JsonVal` is a class that represents the nodes in a JSON structure. You can check the type of a node (num, str, list, or map) by comparing the `JsonVal.type` to `STR`, `NUM`, `LIST`, or `MAP`. Then you can get that value from the node like: +```c++ +switch(json_val.type) { + case STR: + std::string str = json_val.str; + break; + case NUM: + int num = json_val.num; + break; + case LIST: + std::vector list = json_val.list; + break; + case MAP: + std::map map = json_val.map; + break; + default: + // error +} +``` + +There are also convenience constructors for creating `JsonVal` objects for strings and numbers like `JsonVal("myKey")` or `JsonVal(42)`. This makes it easy to get specific keys from a map like `json_val.map[JsonVal("nodes")]`. 
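Putting these JSON utilities together, here is a simplified sketch of a `supportedOps` implementation that walks the graph and marks `exp`/`log` nodes for partitioning. It is a condensed illustration of what `subgraph_lib.cc` does, assuming the graph layout described above (a top-level `nodes` list whose entries carry an `op` string); the `options` map is an `std::unordered_map<std::string, std::string>`. Note that a later commit in this series changes the `ids` argument from `int*` to `std::vector<bool>`, but the logic stays the same.

```c++
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>
#include "lib_api.h"

// operators to include in subgraphs (same whitelist the example library uses)
const std::vector<std::string> op_names({"exp", "log"});

// Simplified supportedOps: parse the graph JSON, walk the topologically sorted
// "nodes" list, and set ids[i] = 1 for every node whose op is whitelisted.
MXReturnValue mySupportedOps(std::string json, const int num_ids, int* ids,
                             std::unordered_map<std::string, std::string>& options) {
  JsonParser parser;
  JsonVal graph = parser.parse_to_json(json);
  JsonVal nodes = graph.map[JsonVal("nodes")];
  for (unsigned i = 0; i < nodes.list.size(); i++) {
    JsonVal node = nodes.list[i];
    JsonVal op = node.map[JsonVal("op")];
    // input/param nodes have op "null" and simply fail the whitelist check
    if (std::find(op_names.begin(), op_names.end(), op.str) != op_names.end())
      ids[i] = 1;
  }
  return MX_SUCCESS;
}
```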
\ No newline at end of file diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index b51d9959962a..370785f74860 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -348,8 +348,7 @@ int MXLoadLib(const char *path) { << "' custom subgraph op, CreateOpState function was not set."; } LOG(INFO) << "\tOp[" << i << "] " << name; - if(isSubgraphOp) - LOG(INFO) << "\t\tisSubgraphOp"; + if (isSubgraphOp) LOG(INFO) << "\t\tisSubgraphOp"; std::string name_str(name); /* @@ -590,7 +589,7 @@ int MXLoadLib(const char *path) { p->attrs.name = n->attrs.name + "_backward"; // copy attributes and subgraphs p->attrs.dict = n->attrs.dict; - for(auto s : n->attrs.subgraphs) + for (auto s : n->attrs.subgraphs) p->attrs.subgraphs.push_back(s); // set control dependency and attr parser p->control_deps.emplace_back(n); @@ -755,7 +754,7 @@ int MXLoadLib(const char *path) { gradOp.set_attr("FInferStorageType", infer_storage_type, plevel); gradOp.set_attr("FResourceRequest", resc_req, plevel); - if(!isSubgraphOp) { + if (!isSubgraphOp) { // register attr parser and standard functions for non-subgraph ops gradOp.set_attr_parser(attr_parser); gradOp.set_num_inputs(num_inouts); @@ -771,7 +770,7 @@ int MXLoadLib(const char *path) { gradOp.set_num_inputs(grad_inouts); gradOp.set_num_outputs(DefaultSubgraphOpNumInputs); } - + if (createop_map.size() != 0) { gradOp.set_attr("TIsLayerOpBackward", true, plevel); auto fstate_backward = [=](const OpStatePtr& state_ptr, From a73c0075f72320d74fc641a5d86219c9fd65baf2 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Sun, 16 Feb 2020 20:09:27 +0000 Subject: [PATCH 06/19] fixed code block --- example/extensions/lib_subgraph/README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index 5b7b39a0bdf3..a77df5a3f30a 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -127,9 +127,10 @@ A partitioning strategy specifies how to partition a model and isolate operators When registering a custom subgraph operator, all thats needed is to register a `createOpState` function and to set that the operator is a subgraph operator by calling the `setIsSubgraphOp` API like: ``` - REGISTER_OP(my_subgraph_op) - .setIsSubgraphOp() - .setCreateOpState(createOpState, "cpu"); +REGISTER_OP(my_subgraph_op) +.setIsSubgraphOp() +.setCreateOpState(createOpState, "cpu"); +``` ### Parsing a JSON string From 6c2529d19b8112d8084bc4d8a39d156a38026d64 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Tue, 18 Feb 2020 11:06:13 +0000 Subject: [PATCH 07/19] fixed docs, changed argument to backend_opts and added comments --- example/extensions/lib_subgraph/README.md | 48 ++++++++++++++++++----- python/mxnet/gluon/block.py | 16 ++++---- src/c_api/c_api.cc | 7 +++- 3 files changed, 52 insertions(+), 19 deletions(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index a77df5a3f30a..8424ca3c7c2d 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -20,22 +20,30 @@ Custom Partitioner Example and Tutorial ## Introduction -Adding custom model partitioners in MXNet used to require deep understanding of the MXNet backend, including operator registration and, followed by recompiling MXNet from source with all of its dependencies. 
This feature allows adding custom partitioners by dynamically loading custom C++ partitioners compiled in external libraries at runtime. +Adding custom model partitioners in MXNet used to require deep understanding of the MXNet backend, including operator registration and other internal classes, followed by recompiling MXNet from source. This feature allows adding custom partitioners by dynamically loading external libraries at runtime. -Custom partitioners enable users to write custom model partitioning strategies without compiling against all of MXNet header files and dependencies. When a library containing custom partitioners is loaded dynamically, the components found in the library will be re-registered in MXNet so that users can use those natively just like other built-in components. +This custom partitioner feature, enables users to write custom model partitioning strategies without compiling against all of MXNet header files and dependencies. When a library containing custom partitioners is loaded dynamically, the components found in the library will be re-registered in MXNet so that users can use those natively just like other built-in components. ## Getting Started ### Have MXNet Ready -First you should install MXNet either from compiling from source code or downloading a nightly build. It doesn’t matter if the build comes with CUDA or MKLDNN. The custom partitioning APIs do not interact with the execution of other native MXNet operators. +the custom partitioner feature was merged recently (#15969) and is not available in versions of MXNet prior to v1.7.0. To use the feature now, please install MXNet either by compiling from source code or downloading a nightly build. For running the following example, it doesn’t matter if it is a CUDA, MKLDNN or plain MXNet build; the custom partitioner doesn’t interact with the execution of other native MXNet features. Note that if you want to write your custom partitioners running on GPU, you still need an MXNet CUDA build. ### Run An Example -You can start getting familiar with custom partitioners by running an example provided in the **example/extensions/lib_subgraph** directory. This example partitions `exp` and `log` operators into subgraphs. Go to the `lib_subgraph` directory and follow these steps: +You can start getting familiar with custom partitioners by running an example provided in the **example/extensions/lib_subgraph** directory. This example partitions `exp` and `log` operators into subgraphs. Go to the **lib_subgraph** directory and follow these steps: -1. Run `make`. The Makefile will generate a dynamic library **libsubgraph_lib.so** compiled from `subgraph_lib.cc`. This is the library you are going to load that contains everything for the custom partitioner. -2. Run `python test_subgraph.py`. It’ll first load the above library, find the components, register them in the MXNet backend, print "Found x", then partition the model and execute the operators like a regular MXNet operator and output the result. +1. Run `make`. The Makefile will generate the dynamic library **libsubgraph_lib.so** which is compiled from the `subgraph_lib.cc` file. This is the library you are going to load that contains everything for the custom partitioner. +2. Run `python test_subgraph.py`. It’ll first load the above library, find the components, register them in the MXNet backend, then partition the model and execute the operators like a regular MXNet operator and output the result. 
Below is the output when running the `python test_subgraph.py` command. Notice that it loads 2 operators: my_gemm and state_gemm. + +``` +[10:38:03] src/c_api/c_api.cc:286: Found 1 operators in library +[10:38:03] src/c_api/c_api.cc:350: Op[0] _custom_subgraph_op +[10:38:03] src/c_api/c_api.cc:785: Found 1 partitioners in library +[10:38:03] src/c_api/c_api.cc:801: Partitioner[0] myProp +[10:38:03] src/c_api/c_api.cc:821: Strategy[0] strategy1 subgraphOp: '_custom_subgraph_op' +``` ### Basic Files For Custom Partitioner Library @@ -53,11 +61,13 @@ For building a library containing your own custom partitioner, compose a C++ sou - `mySupportedOps ` - Operator Support Then compile it to the `mypart_lib.so` dynamic library using the following command: + ```bash g++ -shared -fPIC -std=c++11 mypart_lib.cc -o libmypart_lib.so -I ../../../include/mxnet ``` Finally, you can write a Python script to load the library and partition a model with your custom partitioner: + ```python import mxnet as mx mx.library.load(‘libmyop_lib.so’) @@ -71,6 +81,24 @@ sym_block = nn.SymbolBlock(sym, inputs) sym_block.hybridize(backend='myPart') ``` +### Using a Custom Partitioner Library + +Partitioning APIs in MXNet are available in both Symbol and Gluon APIs. For the Symbol API, the `optimize_for` API can be called on Symbol objects to return a partitioned Symbol. + +``` +optimize_for(backend, args=None, ctx=None, **kwargs) +``` + +The `optimize_for` API takes at least 1 argument, `backend` which is a string that identifies which backend to partition the model for. The `args` argument is optional and takes a list of NDArray or dict of str to NDArray. It is used to infer shapes and types and before partitioning. The `ctx` argument is optional and takes a device context to infer storage types. It also take any other user-specified options that will be passed to the backend partitioning APIs. + +For the Gluon API, the `hybridize` API can be called on HybridBlocks to partition the internal CachedOp Symbol. + +``` +hybridize(backend=None, backend_opts=None) +``` + +When the `hybridize` function is called, Gluon will convert the program’s execution into the style used in symbolic programming. The `backend` argument is a string that identifies which backend to partition the model for. The `backend_opts` takes other user-specified options that will be passed to the backend partitioning APIs. + ### Writing A Custom Partitioner There are several essential building blocks for making a custom partitioner: @@ -87,8 +115,7 @@ There are several essential building blocks for making a custom partitioner: std::string json, const int num_ids, int *ids, - std::unordered_map& options) + std::unordered_map& options) * [REGISTER_PARTITIONER(my_part_name)](./subgraph_lib.cc#L238): * This macro registers the custom partitioner and its properties to MXNet by its name. Notice that a partitioner can have multiple partitioning strategies. This enables multiple *passes* to be run in a single partitioning call from the user. The first argument to `addStrategy` is a user-specified name. The second argument is the `supportedOps` function. The third argument is the name of the subgraph operator to create for each subgraph created during partitioning (see below for more info about subgraph operators). The `setAcceptSubgraph` API registers a callback function that is called for each subgraph created during partitioning (more on this below). 
Notice that the first argument to this function is the strategy to associate with and the second argument is the `acceptSubgraph` function. @@ -119,13 +146,14 @@ Let’s take a closer look at those registry functions: * **supportedOps**: This function takes four arguments. The 1st argument is a JSON string of the model architecture graph, where nodes are inputs/params/weights and edges are data dependencies. The graph is pre-sorted in topological order. When traversing the graph, operators to be partitioned into subgraphs are identified and an entry is set to `1` for the node ID in the `ids` array. Users can pass custom options to the partitioner and they are passed to the function in the `options` map. -* **acceptSubgraph**: This function takes five arguments. The 1st argument is a JSON string of the newly partitioned subgraph. It can be analyzed and accepted/rejected by setting `true`/`false` for the `accept` input. The `options` map is the same one passed to the `supportedOps` API. The `attrs` map provides an API to add user-specified attributes to the subgraph. These attributes will be available at runtime when the subgraph is executed and provides a way to pass info from partitioning-time to runtime. +* **acceptSubgraph**: This function takes five arguments. The 1st argument is a JSON string of the newly partitioned subgraph. It can be analyzed and accepted/rejected by setting `true`/`false` for the `accept` input. You might want to reject a subgraph if it doesnt include all the operators you want, for example. The `options` map is the same one passed to the `supportedOps` API. The `attrs` map provides an API to add user-specified attributes to the subgraph. These attributes will be available at runtime when the subgraph is executed and provides a way to pass info from partitioning-time to runtime. ### Writing A Custom Subgraph Operator A partitioning strategy specifies how to partition a model and isolate operators into subgraphs. In MXNet, subgraphs are just a [stateful operator](../lib_custom_op#writing-stateful-custom-operator). Subgraph operators have an extra attribute called `SUBGRAPH_SYM_JSON` that maps to a JSON string of the subgraph. The expectation is that when a subgraph operator executes a forward/backward call, it executes all of the operators in the subgraph. When registering a custom subgraph operator, all thats needed is to register a `createOpState` function and to set that the operator is a subgraph operator by calling the `setIsSubgraphOp` API like: + ``` REGISTER_OP(my_subgraph_op) .setIsSubgraphOp() @@ -135,12 +163,14 @@ REGISTER_OP(my_subgraph_op) ### Parsing a JSON string To simplify custom partitioner libraries, basic JSON parsing utility functions have been implemented in the `lib_api.h` header file. You create a `JsonParser` object and parse the string by calling the `parse_to_json` API like: + ```c++ JsonParser parser; JsonVal json_val = parser.parse_to_json(json_string); ``` A `JsonVal` is a class that represents the nodes in a JSON structure. You can check the type of a node (num, str, list, or map) by comparing the `JsonVal.type` to `STR`, `NUM`, `LIST`, or `MAP`. 
Then you can get that value from the node like: + ```c++ switch(json_val.type) { case STR: diff --git a/python/mxnet/gluon/block.py b/python/mxnet/gluon/block.py index f13993eb3fc3..e925b31a280f 100644 --- a/python/mxnet/gluon/block.py +++ b/python/mxnet/gluon/block.py @@ -879,7 +879,7 @@ def __init__(self, prefix=None, params=None): self._callback = None self._monitor_all = False self._backend = None - self._backend_args = {} + self._backend_opts = {} def __setattr__(self, name, value): """Registers parameters.""" @@ -974,7 +974,7 @@ def _build_cache(self, *args): arg_array = [args[data_names[name]] if name in data_names.keys() else params[name].data() for name in out.list_arguments()] # Partition the graph. - out = out.optimize_for(self._backend, arg_array, ctx, **self._backend_args) + out = out.optimize_for(self._backend, arg_array, ctx, **self._backend_opts) self._cached_op = ndarray.CachedOp(out, flags) @@ -1040,7 +1040,7 @@ def register_child(self, block, name=None): super(HybridBlock, self).register_child(block, name) self._clear_cached_op() - def hybridize(self, active=True, backend=None, backend_args=None, **kwargs): + def hybridize(self, active=True, backend=None, backend_opts=None, **kwargs): """Activates or deactivates :py:class:`HybridBlock` s recursively. Has no effect on non-hybrid children. @@ -1050,7 +1050,7 @@ def hybridize(self, active=True, backend=None, backend_args=None, **kwargs): Whether to turn hybrid on or off. backend : str The name of backend, as registered in `SubgraphBackendRegistry`, default None - backend_args : dict of arguments, optional + backend_opts : dict of user-specified options to pass to the backend for partitioning, optional Passed on to `PrePartition` and `PostPartition` functions of `SubgraphProperty` static_alloc : bool, default False Statically allocate memory to improve speed. Memory usage may increase. @@ -1061,10 +1061,10 @@ def hybridize(self, active=True, backend=None, backend_args=None, **kwargs): """ self._backend = backend - if backend_args is not None: - assert isinstance(backend_args, dict), \ - "HybridBlock hybridize requires backend_args to be a dictionary." - self._backend_args = backend_args + if backend_opts is not None: + assert isinstance(backend_opts, dict), \ + "HybridBlock hybridize requires backend_opts to be a dictionary." 
+ self._backend_opts = backend_opts self._active = active self._flags = list(kwargs.items()) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 370785f74860..ef4db2fbd1e2 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -435,7 +435,7 @@ int MXLoadLib(const char *path) { CHECK(callParseAttrs(parse_fp, attr_keys.data(), attr_vals.data(), attr_keys.size(), &num_in, &num_out)) << "Error calling ParseAttrs::num_outputs for custom operator '" << name_str << "'"; - + // for backward passes, inputs + outputs + input gradients (one for each output) return num_in + 2 * num_out; }; @@ -760,9 +760,10 @@ int MXLoadLib(const char *path) { gradOp.set_num_inputs(num_inouts); gradOp.set_num_outputs(num_inputs); } else { - // for subgraph ops use special functions + // for subgraph ops use special functions that do not invoke attr_parser using namespace mxnet::op; auto grad_inouts = [=](const nnvm::NodeAttrs& attrs) { + // for backward passes, inputs + outputs + input gradients (one for each output) uint32_t cnt = DefaultSubgraphOpNumInputs(attrs); cnt += 2 * DefaultSubgraphOpNumOutputs(attrs); return cnt; @@ -772,6 +773,7 @@ int MXLoadLib(const char *path) { } if (createop_map.size() != 0) { + // for stateful operators gradOp.set_attr("TIsLayerOpBackward", true, plevel); auto fstate_backward = [=](const OpStatePtr& state_ptr, const OpContext& ctx, @@ -785,6 +787,7 @@ int MXLoadLib(const char *path) { gradOp.set_attr("FStatefulComputeEx", fstate_backward, plevel); gradOp.set_attr("FStatefulComputeEx", fstate_backward, plevel); } else { + // for stateless operators if (backward_ctx_map.count("cpu") > 0) { fcomp_t fcomp_back_cpu = backward_ctx_map.at("cpu"); auto backward_cpu_lambda = [=](const nnvm::NodeAttrs& attrs, From b08dc9c813125fe19f184b05929c555f2d78c641 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Tue, 18 Feb 2020 19:25:30 +0000 Subject: [PATCH 08/19] added Gluon test for custom subgraph library unittests --- tests/python/unittest/test_extensions.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/python/unittest/test_extensions.py b/tests/python/unittest/test_extensions.py index 39ad9d03d470..57c41e0302be 100644 --- a/tests/python/unittest/test_extensions.py +++ b/tests/python/unittest/test_extensions.py @@ -22,6 +22,8 @@ import unittest import mxnet as mx import numpy as np +from mxnet import nd +from mxnet.gluon import nn from mxnet.base import MXNetError from mxnet.test_utils import download, is_cd_run, assert_almost_equal, default_context @@ -157,6 +159,14 @@ def test_subgraph(): # check that result matches one executed by MXNet assert_almost_equal(out[0].asnumpy(), out3[0].asnumpy(), rtol=1e-3, atol=1e-3) + # Gluon Hybridize partitioning with shapes/types + sym_block = nn.SymbolBlock(sym, [a,b]) + sym_block.initialize() + sym_block.hybridize(backend='myProp') + out4 = sym_block(mx.nd.ones((3,2)),mx.nd.ones((3,2))) + # check that result matches one executed by MXNet + assert_almost_equal(out[0].asnumpy(), out4[0].asnumpy(), rtol=1e-3, atol=1e-3) + @unittest.skipIf(check_platform(), "not all machine types supported") @unittest.skipIf(is_cd_run(), "continuous delivery run - ignoring test") @unittest.skipIf(default_context().device_type == 'cpu', "ignoring custom_op_gpu test on cpu run") From 07837c457d87349d1d872cc8c53d584f4444d95b Mon Sep 17 00:00:00 2001 From: samskalicky Date: Tue, 18 Feb 2020 22:33:16 +0000 Subject: [PATCH 09/19] moved gpu custom operator tests to different file --- tests/python/gpu/test_extensions_gpu.py | 75 ++++++++++++++++++++++++ 
tests/python/gpu/test_operator_gpu.py | 3 +- tests/python/unittest/test_extensions.py | 44 -------------- 3 files changed, 76 insertions(+), 46 deletions(-) create mode 100644 tests/python/gpu/test_extensions_gpu.py diff --git a/tests/python/gpu/test_extensions_gpu.py b/tests/python/gpu/test_extensions_gpu.py new file mode 100644 index 000000000000..43a15675da68 --- /dev/null +++ b/tests/python/gpu/test_extensions_gpu.py @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# This test checks if dynamic loading of library into MXNet is successful + +import os +import platform +import unittest +import mxnet as mx +import numpy as np +from mxnet import nd +from mxnet.gluon import nn +from mxnet.base import MXNetError +from mxnet.test_utils import download, is_cd_run, assert_almost_equal, default_context + +def check_platform(): + return platform.machine() not in ['x86_64', 'AMD64'] + +@unittest.skipIf(check_platform(), "not all machine types supported") +@unittest.skipIf(is_cd_run(), "continuous delivery run - ignoring test") +@unittest.skipIf(default_context().device_type == 'cpu', "ignoring custom_op_gpu test on cpu run") +def test_custom_op_gpu(): + # possible places to find library file + if (os.name=='posix'): + lib = 'libcustomop_gpu_lib.so' + if os.path.exists(lib): + fname = lib + elif os.path.exists('build/'+lib): + fname = 'build/'+lib + else: + raise MXNetError("library %s not found " % lib) + elif (os.name=='nt'): + lib = 'libcustomop_gpu_lib.dll' + if os.path.exists('windows_package\\lib\\'+lib): + fname = 'windows_package\\lib\\'+lib + else: + raise MXNetError("library %s not found " % lib) + + fname = os.path.abspath(fname) + # load the library containing gemm custom operators + mx.library.load(fname) + + # test symbol custom relu operator in gpu + b = mx.nd.array([[-2,-1],[1,2]], ctx=mx.gpu()) + c = mx.sym.Variable('c') + d = mx.sym.Variable('d') + e = mx.sym.my_relu(c) + base = mx.sym.relu(d) + in_grad = [mx.nd.empty((2,2), ctx=mx.gpu())] + in_grad_base = [mx.nd.empty((2,2), ctx=mx.gpu())] + exe = e.bind(ctx=mx.gpu(), args={'c':b}, args_grad=in_grad) + exe_base = base.bind(ctx=mx.gpu(), args={'d':b}, args_grad=in_grad_base) + out = exe.forward() + out_base = exe_base.forward() + assert_almost_equal(out_base[0].asnumpy(), out[0].asnumpy(), rtol=1e-3, atol=1e-3) + + # test backward + out_grad = mx.nd.ones((2,2), ctx=mx.gpu()) + exe.backward([out_grad]) + exe_base.backward([out_grad]) + assert_almost_equal(in_grad_base[0].asnumpy(), in_grad[0].asnumpy(), rtol=1e-3, atol=1e-3) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index e77427f033c1..830281190577 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -46,8 +46,8 @@ from test_subgraph_op 
import * from test_gluon_gpu import _test_bulking from test_contrib_operator import test_multibox_target_op +from test_extensions_gpu import * from test_tvm_op import * -from test_extensions import * from test_contrib_optimizer import test_adamw set_default_context(mx.gpu(0)) @@ -55,7 +55,6 @@ del test_support_vector_machine_l2_svm # noqa del test_custom_op_fork #noqa - def check_countsketch(in_dim,out_dim,n): data = mx.sym.Variable("data") h = mx.sym.Variable("h") diff --git a/tests/python/unittest/test_extensions.py b/tests/python/unittest/test_extensions.py index 57c41e0302be..726cb6a998a3 100644 --- a/tests/python/unittest/test_extensions.py +++ b/tests/python/unittest/test_extensions.py @@ -166,47 +166,3 @@ def test_subgraph(): out4 = sym_block(mx.nd.ones((3,2)),mx.nd.ones((3,2))) # check that result matches one executed by MXNet assert_almost_equal(out[0].asnumpy(), out4[0].asnumpy(), rtol=1e-3, atol=1e-3) - -@unittest.skipIf(check_platform(), "not all machine types supported") -@unittest.skipIf(is_cd_run(), "continuous delivery run - ignoring test") -@unittest.skipIf(default_context().device_type == 'cpu', "ignoring custom_op_gpu test on cpu run") -def test_custom_op_gpu(): - # possible places to find library file - if (os.name=='posix'): - lib = 'libcustomop_gpu_lib.so' - if os.path.exists(lib): - fname = lib - elif os.path.exists('build/'+lib): - fname = 'build/'+lib - else: - raise MXNetError("library %s not found " % lib) - elif (os.name=='nt'): - lib = 'libcustomop_gpu_lib.dll' - if os.path.exists('windows_package\\lib\\'+lib): - fname = 'windows_package\\lib\\'+lib - else: - raise MXNetError("library %s not found " % lib) - - fname = os.path.abspath(fname) - # load the library containing gemm custom operators - mx.library.load(fname) - - # test symbol custom relu operator in gpu - b = mx.nd.array([[-2,-1],[1,2]], ctx=mx.gpu()) - c = mx.sym.Variable('c') - d = mx.sym.Variable('d') - e = mx.sym.my_relu(c) - base = mx.sym.relu(d) - in_grad = [mx.nd.empty((2,2), ctx=mx.gpu())] - in_grad_base = [mx.nd.empty((2,2), ctx=mx.gpu())] - exe = e.bind(ctx=mx.gpu(), args={'c':b}, args_grad=in_grad) - exe_base = base.bind(ctx=mx.gpu(), args={'d':b}, args_grad=in_grad_base) - out = exe.forward() - out_base = exe_base.forward() - assert_almost_equal(out_base[0].asnumpy(), out[0].asnumpy(), rtol=1e-3, atol=1e-3) - - # test backward - out_grad = mx.nd.ones((2,2), ctx=mx.gpu()) - exe.backward([out_grad]) - exe_base.backward([out_grad]) - assert_almost_equal(in_grad_base[0].asnumpy(), in_grad[0].asnumpy(), rtol=1e-3, atol=1e-3) From 64ad1eac5cb826e9014771afb1e373f89dd73725 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Tue, 18 Feb 2020 22:33:41 +0000 Subject: [PATCH 10/19] fixed stateful op registration context support --- src/c_api/c_api.cc | 47 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index ef4db2fbd1e2..adebe085fd1d 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -717,33 +717,32 @@ int MXLoadLib(const char *path) { CustomFComputeDispatcher(name_str, nullptr, nullptr, nullptr, callFStatefulComp, 1, &state_ptr, ctx, inputs, req, outputs); }; - regOp.set_attr("FStatefulComputeEx", fstate_forward, plevel); - regOp.set_attr("FStatefulComputeEx", fstate_forward, plevel); + if (createop_map.count("cpu") > 0) + regOp.set_attr("FStatefulComputeEx", fstate_forward, plevel); + if (createop_map.count("gpu") > 0) + regOp.set_attr("FStatefulComputeEx", fstate_forward, plevel); } else { - 
if (forward_ctx_map.count("cpu") > 0) { - fcomp_t fcomp_cpu = forward_ctx_map.at("cpu"); - auto forward_cpu_lambda = [=](const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CustomFComputeDispatcher(name_str, callFComp, fcomp_cpu, &attrs, + auto forward_lambda = [=](const nnvm::NodeAttrs& attrs, + const OpContext& ctx, + const std::vector& inputs, + const std::vector& req, + const std::vector& outputs) { + if (ctx.dev_mask() == Context::kCPU) { + CHECK(forward_ctx_map.count("cpu") > 0) + fcomp_t fcomp = forward_ctx_map.at("cpu"); + CustomFComputeDispatcher(name_str, callFComp, fcomp, &attrs, nullptr, 0, nullptr, ctx, inputs, req, outputs); - }; - regOp.set_attr("FComputeEx", forward_cpu_lambda, plevel); - } - if (forward_ctx_map.count("gpu") > 0) { - fcomp_t fcomp_gpu = forward_ctx_map.at("gpu"); - auto forward_gpu_lambda = [=](const nnvm::NodeAttrs& attrs, - const OpContext& ctx, - const std::vector& inputs, - const std::vector& req, - const std::vector& outputs) { - CustomFComputeDispatcher(name_str, callFComp, fcomp_gpu, &attrs, + } else if (ctx.dev_mask() == Context::kGPU) { + CHECK(forward_ctx_map.count("gpu") > 0) + fcomp_t fcomp = forward_ctx_map.at("gpu"); + CustomFComputeDispatcher(name_str, callFComp, fcomp, &attrs, nullptr, 0, nullptr, ctx, inputs, req, outputs); - }; - regOp.set_attr("FComputeEx", forward_gpu_lambda, plevel); - } + } + }; + if (forward_ctx_map.count("cpu") > 0) + regOp.set_attr("FComputeEx", forward_lambda, plevel); + if (forward_ctx_map.count("gpu") > 0) + regOp.set_attr("FComputeEx", forward_lambda, plevel); } // optionally add fgradient if user specified a function, or for stateful ops if (backward_ctx_map.size() != 0 || createop_map.size() != 0) { From b089befd983c3ac918e8ba366d2be20cfc7ffd34 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Tue, 18 Feb 2020 22:41:37 +0000 Subject: [PATCH 11/19] unlinked test_operator_gpu.py with test_extensions_gpu.py --- tests/python/gpu/test_operator_gpu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/gpu/test_operator_gpu.py b/tests/python/gpu/test_operator_gpu.py index 830281190577..a017b8ce59c2 100644 --- a/tests/python/gpu/test_operator_gpu.py +++ b/tests/python/gpu/test_operator_gpu.py @@ -46,7 +46,6 @@ from test_subgraph_op import * from test_gluon_gpu import _test_bulking from test_contrib_operator import test_multibox_target_op -from test_extensions_gpu import * from test_tvm_op import * from test_contrib_optimizer import test_adamw From da11728eb088342cf8eba99e17f39506a747f57f Mon Sep 17 00:00:00 2001 From: samskalicky Date: Tue, 18 Feb 2020 22:42:44 +0000 Subject: [PATCH 12/19] removed cpu context check since its in gpu folder now --- tests/python/gpu/test_extensions_gpu.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/python/gpu/test_extensions_gpu.py b/tests/python/gpu/test_extensions_gpu.py index 43a15675da68..f76d0e33e7dc 100644 --- a/tests/python/gpu/test_extensions_gpu.py +++ b/tests/python/gpu/test_extensions_gpu.py @@ -32,7 +32,6 @@ def check_platform(): @unittest.skipIf(check_platform(), "not all machine types supported") @unittest.skipIf(is_cd_run(), "continuous delivery run - ignoring test") -@unittest.skipIf(default_context().device_type == 'cpu', "ignoring custom_op_gpu test on cpu run") def test_custom_op_gpu(): # possible places to find library file if (os.name=='posix'): From 86483ca533130c3eb74c8f2d9c64c942f42fc24d Mon Sep 17 00:00:00 2001 From: samskalicky Date: Tue, 18 Feb 2020 
22:54:44 +0000 Subject: [PATCH 13/19] fixed context --- src/c_api/c_api.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index adebe085fd1d..d1931890fc5e 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -727,13 +727,13 @@ int MXLoadLib(const char *path) { const std::vector& inputs, const std::vector& req, const std::vector& outputs) { - if (ctx.dev_mask() == Context::kCPU) { - CHECK(forward_ctx_map.count("cpu") > 0) + if (ctx.run_ctx.ctx.dev_mask() == Context::kCPU) { + CHECK(forward_ctx_map.count("cpu") > 0); fcomp_t fcomp = forward_ctx_map.at("cpu"); CustomFComputeDispatcher(name_str, callFComp, fcomp, &attrs, nullptr, 0, nullptr, ctx, inputs, req, outputs); - } else if (ctx.dev_mask() == Context::kGPU) { - CHECK(forward_ctx_map.count("gpu") > 0) + } else if (ctx.run_ctx.ctx.dev_mask() == Context::kGPU) { + CHECK(forward_ctx_map.count("gpu") > 0); fcomp_t fcomp = forward_ctx_map.at("gpu"); CustomFComputeDispatcher(name_str, callFComp, fcomp, &attrs, nullptr, 0, nullptr, ctx, inputs, req, outputs); From d2e718c192bbc5a7d750bf0c908765a9ce72a803 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 19 Feb 2020 01:09:54 +0000 Subject: [PATCH 14/19] fixed whitespace --- src/c_api/c_api.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index d1931890fc5e..7b58fea0fa09 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -728,12 +728,12 @@ int MXLoadLib(const char *path) { const std::vector& req, const std::vector& outputs) { if (ctx.run_ctx.ctx.dev_mask() == Context::kCPU) { - CHECK(forward_ctx_map.count("cpu") > 0); + CHECK_GT(forward_ctx_map.count("cpu"), 0); fcomp_t fcomp = forward_ctx_map.at("cpu"); CustomFComputeDispatcher(name_str, callFComp, fcomp, &attrs, nullptr, 0, nullptr, ctx, inputs, req, outputs); } else if (ctx.run_ctx.ctx.dev_mask() == Context::kGPU) { - CHECK(forward_ctx_map.count("gpu") > 0); + CHECK_GT(forward_ctx_map.count("gpu"), 0); fcomp_t fcomp = forward_ctx_map.at("gpu"); CustomFComputeDispatcher(name_str, callFComp, fcomp, &attrs, nullptr, 0, nullptr, ctx, inputs, req, outputs); From 4a1378989af47a9208d9cdebeb9f98b1920e21c0 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 19 Feb 2020 06:41:02 +0000 Subject: [PATCH 15/19] changed acceptSubgraph API to use vector of bool --- example/extensions/lib_subgraph/subgraph_lib.cc | 5 ++--- include/mxnet/lib_api.h | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/example/extensions/lib_subgraph/subgraph_lib.cc b/example/extensions/lib_subgraph/subgraph_lib.cc index 8f339159fbc5..321777851221 100644 --- a/example/extensions/lib_subgraph/subgraph_lib.cc +++ b/example/extensions/lib_subgraph/subgraph_lib.cc @@ -177,8 +177,7 @@ REGISTER_OP(_custom_subgraph_op) const std::vector op_names({"exp","log"}); MXReturnValue mySupportedOps(std::string json, - const int num_ids, - int *ids, + std::vector ids, std::unordered_map& options) { for (auto kv : options) { std::cout << "option: " << kv.first << " ==> " << kv.second << std::endl; @@ -210,7 +209,7 @@ MXReturnValue mySupportedOps(std::string json, //check if op is in whitelist if(std::find(op_names.begin(),op_names.end(),op.str.c_str()) != op_names.end()) { // found op in whitelist, set value to 1 to include op in subgraph - ids[i]=1; + ids[i]=true; } } } diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index aeb5f79e2f70..3a4667073183 100644 --- a/include/mxnet/lib_api.h +++ 
b/include/mxnet/lib_api.h @@ -713,7 +713,7 @@ class CustomOp { }; /*! \brief Custom Subgraph Create function template */ -typedef MXReturnValue (*supportedOps_t)(std::string, int, int*, +typedef MXReturnValue (*supportedOps_t)(std::string, std::vector, std::unordered_map&); typedef MXReturnValue (*acceptSubgraph_t)(std::string, int, bool*, std::unordered_map&, @@ -1271,7 +1271,17 @@ extern "C" { for (int i = 0; i < num_opts; i++) { opts[std::string(opt_keys[i])] = std::string(opt_vals[i]); } - return supportedOps(subgraph_json, num_ids, ids, opts); + // create array of bools for operator support + std::vector _ids(num_ids, false); + // call user's supportedOps function + MXReturnValue retval = supportedOps(subgraph_json, _ids, opts); + if (!retval) return retval; + + // copy bools in ids to ints + for (int i = 0; i < num_ids; i++) + ids[i] = _ids[i]; + + return retval; } /*! \brief returns status of calling parse attributes function for operator from library */ From 83414a33953105ffe7d4870d573d0fd28989b122 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 19 Feb 2020 06:41:24 +0000 Subject: [PATCH 16/19] addressed Lin's comments --- example/extensions/lib_subgraph/README.md | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index 8424ca3c7c2d..9d7cbb54e602 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -22,13 +22,13 @@ Custom Partitioner Example and Tutorial Adding custom model partitioners in MXNet used to require deep understanding of the MXNet backend, including operator registration and other internal classes, followed by recompiling MXNet from source. This feature allows adding custom partitioners by dynamically loading external libraries at runtime. -This custom partitioner feature, enables users to write custom model partitioning strategies without compiling against all of MXNet header files and dependencies. When a library containing custom partitioners is loaded dynamically, the components found in the library will be re-registered in MXNet so that users can use those natively just like other built-in components. +This custom partitioner feature enables users to write custom model partitioning strategies without compiling against all of MXNet header files and dependencies. When a library containing custom partitioners is loaded dynamically, the components found in the library will be re-registered in MXNet so that users can use those natively just like other built-in components. ## Getting Started ### Have MXNet Ready -the custom partitioner feature was merged recently (#15969) and is not available in versions of MXNet prior to v1.7.0. To use the feature now, please install MXNet either by compiling from source code or downloading a nightly build. For running the following example, it doesn’t matter if it is a CUDA, MKLDNN or plain MXNet build; the custom partitioner doesn’t interact with the execution of other native MXNet features. Note that if you want to write your custom partitioners running on GPU, you still need an MXNet CUDA build. +The custom partitioner feature was merged recently (#15969) and is not available in versions of MXNet prior to v1.7.0. To use the feature now, please install MXNet either by installing the nightly pip wheel or compiling from source. 
For running the following example, it doesn’t matter if it is a CUDA, MKLDNN or plain MXNet build; the custom partitioner doesn’t interact with the execution of other native MXNet features. Note that if you want to write your custom partitioners running on GPU, you still need an MXNet CUDA build. ### Run An Example @@ -113,8 +113,7 @@ There are several essential building blocks for making a custom partitioner: MXReturnValue supportedOps( std::string json, - const int num_ids, - int *ids, + std::vector& ids, std::unordered_map& options) * [REGISTER_PARTITIONER(my_part_name)](./subgraph_lib.cc#L238): @@ -144,9 +143,9 @@ Also there are some optional functions you can specify: Let’s take a closer look at those registry functions: -* **supportedOps**: This function takes four arguments. The 1st argument is a JSON string of the model architecture graph, where nodes are inputs/params/weights and edges are data dependencies. The graph is pre-sorted in topological order. When traversing the graph, operators to be partitioned into subgraphs are identified and an entry is set to `1` for the node ID in the `ids` array. Users can pass custom options to the partitioner and they are passed to the function in the `options` map. +* **supportedOps**: This function takes three arguments. The 1st argument is a JSON string of the model architecture graph, where nodes are inputs/params/weights and edges are data dependencies. The graph is pre-sorted in topological order. The 2nd argument is an array of booleans, one for each operator in the model. When traversing the graph, operators to be partitioned into subgraphs are identified and an entry is set to `true` for the node ID in the `ids` array. The last argument is the map of options specified by the user. Users can pass custom options to the partitioner and they are passed to this function in the `options` map. -* **acceptSubgraph**: This function takes five arguments. The 1st argument is a JSON string of the newly partitioned subgraph. It can be analyzed and accepted/rejected by setting `true`/`false` for the `accept` input. You might want to reject a subgraph if it doesnt include all the operators you want, for example. The `options` map is the same one passed to the `supportedOps` API. The `attrs` map provides an API to add user-specified attributes to the subgraph. These attributes will be available at runtime when the subgraph is executed and provides a way to pass info from partitioning-time to runtime. +* **acceptSubgraph**: This function takes five arguments. The 1st argument is a JSON string of the newly partitioned subgraph. The 2nd argument is the subgraph ID; this is just a number MXNet uses to identify this particular subgraph (it starts at zero and increments). The 3rd argument is an output to be set in this function to tell MXNet whether to accept (value: `true`) or reject (value: `false`) the subgraph. The 4th argument is the map of options specified by the user. The last argument is a map of attributes that should be set on the created subgraph. These attributes will be available later at runtime, and provide a mechanism to pass info from partition-time to runtime. You might want to reject a subgraph if it doesn’t include all the operators you want, for example. The `options` map is the same one passed to the `supportedOps` API. 
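To make the updated `supportedOps` signature above concrete, here is a minimal sketch of a whitelist-based implementation. It is not the exact code from `subgraph_lib.cc`: the stand-in `MXReturnValue` enum, the fully spelled-out template parameters (`std::vector<bool>&`, `std::unordered_map<std::string, std::string>&`, which the rendered diff drops), and the `opNameOfNode` helper are assumptions made for illustration only.

```c++
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

// Stand-in for the enum defined in include/mxnet/lib_api.h.
enum MXReturnValue { MX_FAIL = 0, MX_SUCCESS = 1 };

// Hypothetical helper: returns the operator name of graph node `i`.
// The real example parses the graph JSON with the JsonParser from lib_api.h;
// this stub only keeps the sketch self-contained.
std::string opNameOfNode(const std::string& json, size_t i) { return ""; }

static const std::vector<std::string> whitelist{"exp", "log"};

MXReturnValue mySupportedOps(std::string json,
                             std::vector<bool>& ids,  // one entry per node in the graph
                             std::unordered_map<std::string, std::string>& options) {
  // options: user-supplied key/value strings (unused in this sketch)
  for (size_t i = 0; i < ids.size(); ++i) {
    // Mark a node for inclusion in a subgraph by setting its entry to true.
    if (std::find(whitelist.begin(), whitelist.end(),
                  opNameOfNode(json, i)) != whitelist.end())
      ids[i] = true;
  }
  return MX_SUCCESS;
}
```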
### Writing A Custom Subgraph Operator From 61bf3028288f01f62d0e4f07b79146dfcba97527 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 19 Feb 2020 06:46:02 +0000 Subject: [PATCH 17/19] fixed whitespace --- include/mxnet/lib_api.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 3a4667073183..04c86da2d6ce 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -1276,7 +1276,7 @@ extern "C" { // call user's supportedOps function MXReturnValue retval = supportedOps(subgraph_json, _ids, opts); if (!retval) return retval; - + // copy bools in ids to ints for (int i = 0; i < num_ids; i++) ids[i] = _ids[i]; From 9afe2786e0bef48e8420fc8fc02d6adb7250384e Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 19 Feb 2020 07:57:20 +0000 Subject: [PATCH 18/19] changed acceptSubgraph to reviewSubgraph --- example/extensions/lib_subgraph/README.md | 12 ++++---- .../extensions/lib_subgraph/subgraph_lib.cc | 4 +-- include/mxnet/lib_api.h | 28 +++++++++---------- src/c_api/c_api.cc | 11 ++++---- .../partitioner/custom_subgraph_property.h | 22 +++++++-------- 5 files changed, 38 insertions(+), 39 deletions(-) diff --git a/example/extensions/lib_subgraph/README.md b/example/extensions/lib_subgraph/README.md index 9d7cbb54e602..b113be267fd3 100644 --- a/example/extensions/lib_subgraph/README.md +++ b/example/extensions/lib_subgraph/README.md @@ -117,22 +117,22 @@ There are several essential building blocks for making a custom partitioner: std::unordered_map& options) * [REGISTER_PARTITIONER(my_part_name)](./subgraph_lib.cc#L238): - * This macro registers the custom partitioner and its properties to MXNet by its name. Notice that a partitioner can have multiple partitioning strategies. This enables multiple *passes* to be run in a single partitioning call from the user. The first argument to `addStrategy` is a user-specified name. The second argument is the `supportedOps` function. The third argument is the name of the subgraph operator to create for each subgraph created during partitioning (see below for more info about subgraph operators). The `setAcceptSubgraph` API registers a callback function that is called for each subgraph created during partitioning (more on this below). Notice that the first argument to this function is the strategy to associate with and the second argument is the `acceptSubgraph` function. + * This macro registers the custom partitioner and its properties to MXNet by its name. Notice that a partitioner can have multiple partitioning strategies. This enables multiple *passes* to be run in a single partitioning call from the user. The first argument to `addStrategy` is a user-specified name. The second argument is the `supportedOps` function. The third argument is the name of the subgraph operator to create for each subgraph created during partitioning (see below for more info about subgraph operators). The `setReviewSubgraph` API registers a callback function that is called for each subgraph created during partitioning (more on this below). Notice that the first argument to this function is the strategy to associate with and the second argument is the `reviewSubgraph` function. 
REGISTER_PARTITIONER(my_part_name) .addStrategy("strategy1", supportedOps, "_custom_subgraph_op") - .setAcceptSubgraph("strategy1", - acceptSubgraph); + .setReviewSubgraph("strategy1", + reviewSubgraph); Also there are some optional functions you can specify: -* [acceptSubgraph](./subgraph_lib.cc#L220): +* [reviewSubgraph](./subgraph_lib.cc#L220): * This function provides an opportunity to accept/reject a subgraph after MXNet partitions it. It also allows specifying custom attributes on the subgraph (ie. user-generated IDs). If you do not register this function, subgraphs will be accepted by default. - MXReturnValue acceptSubgraph( + MXReturnValue reviewSubgraph( std::string json, int subraph_id, bool* accept, @@ -145,7 +145,7 @@ Let’s take a closer look at those registry functions: * **supportedOps**: This function takes three arguments. The 1st argument is a JSON string of the model architecture graph, where nodes are inputs/params/weights and edges are data dependencies. The graph is pre-sorted in topological order. The 2nd argument is an array of booleans, one for each operator in the model. When traversing the graph, operators to be partitioned into subgraphs are identified and an entry is set to `true` for the node ID in the `ids` array. The last argument is the map of options specified by the user. Users can pass custom options to the partitioner and they are passed to this function in the `options` map. -* **acceptSubgraph**: This function takes five arguments. The 1st argument is a JSON string of the newly partitioned subgraph. The 2nd argument is the subgraph ID; this is just a number MXNet uses to identify this particular subgraph (it starts at zero and increments). The 3rd argument is an output to be set in this function to tell MXNet whether to accept (value: `true`) or reject (value: `false`) the subgraph. The 4th argument is the map of options specified by the user. The last argument is a map of attributes that should be set on the created subgraph. These attributes will be available later at runtime, and provide a mechanism to pass info from partition-time to runtime. You might want to reject a subgraph if it doesn’t include all the operators you want, for example. The `options` map is the same one passed to the `supportedOps` API. +* **reviewSubgraph**: This function takes five arguments. The 1st argument is a JSON string of the newly partitioned subgraph. The 2nd argument is the subgraph ID; this is just a number MXNet uses to identify this particular subgraph (it starts at zero and increments). The 3rd argument is an output to be set in this function to tell MXNet whether to accept (value: `true`) or reject (value: `false`) the subgraph. The 4th argument is the map of options specified by the user. The last argument is a map of attributes that should be set on the created subgraph. These attributes will be available later at runtime, and provide a mechanism to pass info from partition-time to runtime. You might want to reject a subgraph if it doesn’t include all the operators you want, for example. The `options` map is the same one passed to the `supportedOps` API. 
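As a companion to the `reviewSubgraph` description above, here is a minimal sketch of a review callback with the five-argument shape the README documents. It is only an illustration, not the body of `myReviewSubgraph` from `subgraph_lib.cc`: the stand-in `MXReturnValue` enum, the spelled-out template parameters, and the `reject` option and `myKey` attribute names are assumptions made for this sketch.

```c++
#include <string>
#include <unordered_map>

// Stand-in for the enum defined in include/mxnet/lib_api.h.
enum MXReturnValue { MX_FAIL = 0, MX_SUCCESS = 1 };

MXReturnValue myReviewSubgraph(std::string json, int subgraph_id, bool* accept,
                               std::unordered_map<std::string, std::string>& options,
                               std::unordered_map<std::string, std::string>& attrs) {
  // json: the partitioned subgraph; subgraph_id: its index (unused in this sketch).
  // Reject the subgraph if the user passed the (illustrative) option reject=True ...
  if (options.count("reject") && options.at("reject") == "True") {
    *accept = false;
  } else {
    // ... otherwise accept it and tag it with a custom attribute that will be
    // visible on the subgraph node at runtime.
    *accept = true;
    attrs["myKey"] = "myVal";
  }
  return MX_SUCCESS;
}
```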
### Writing A Custom Subgraph Operator diff --git a/example/extensions/lib_subgraph/subgraph_lib.cc b/example/extensions/lib_subgraph/subgraph_lib.cc index 321777851221..da888fd10383 100644 --- a/example/extensions/lib_subgraph/subgraph_lib.cc +++ b/example/extensions/lib_subgraph/subgraph_lib.cc @@ -216,7 +216,7 @@ MXReturnValue mySupportedOps(std::string json, return MX_SUCCESS; } -MXReturnValue myAcceptSubgraph(std::string json, int subraph_id, bool* accept, +MXReturnValue myReviewSubgraph(std::string json, int subraph_id, bool* accept, std::unordered_map& options, std::unordered_map& attrs) { for (auto kv : options) { @@ -237,7 +237,7 @@ MXReturnValue myAcceptSubgraph(std::string json, int subraph_id, bool* accept, REGISTER_PARTITIONER(myProp) .addStrategy("strategy1", mySupportedOps, "_custom_subgraph_op") -.setAcceptSubgraph("strategy1", myAcceptSubgraph); +.setReviewSubgraph("strategy1", myReviewSubgraph); MXReturnValue initialize(int version) { if (version >= 10400) { diff --git a/include/mxnet/lib_api.h b/include/mxnet/lib_api.h index 04c86da2d6ce..d59f2b12da37 100644 --- a/include/mxnet/lib_api.h +++ b/include/mxnet/lib_api.h @@ -715,7 +715,7 @@ class CustomOp { /*! \brief Custom Subgraph Create function template */ typedef MXReturnValue (*supportedOps_t)(std::string, std::vector, std::unordered_map&); -typedef MXReturnValue (*acceptSubgraph_t)(std::string, int, bool*, +typedef MXReturnValue (*reviewSubgraph_t)(std::string, int, bool*, std::unordered_map&, std::unordered_map&); @@ -735,21 +735,21 @@ class CustomPartitioner { op_names.push_back(sg_name); return *this; } - CustomPartitioner& setAcceptSubgraph(const char* prop_name, acceptSubgraph_t fn) { - accept_map[std::string(prop_name)] = fn; + CustomPartitioner& setReviewSubgraph(const char* prop_name, reviewSubgraph_t fn) { + review_map[std::string(prop_name)] = fn; return *this; } - acceptSubgraph_t getAcceptSubgraph(int stg_id) { + reviewSubgraph_t getReviewSubgraph(int stg_id) { std::string prop(strategies[stg_id]); - if (accept_map.find(prop) != accept_map.end()) - return accept_map[prop]; + if (review_map.find(prop) != review_map.end()) + return review_map[prop]; else return nullptr; } /*! \brief partitioner name */ const char* name; - std::map accept_map; + std::map review_map; /*! \brief strategy names */ std::vector strategies; /*! 
\brief supported ops function */ @@ -908,7 +908,7 @@ typedef int (*partRegGetCount_t)(int idx, const char** name); #define MXLIB_PARTREGGET_STR "_partRegGet" typedef void (*partRegGet_t)(int part_idx, int stg_idx, const char** strategy, - supportedOps_t* supportedOps, acceptSubgraph_t* acceptSubgraph, + supportedOps_t* supportedOps, reviewSubgraph_t* reviewSubgraph, const char** op_name); #define MXLIB_PARTCALLSUPPORTEDOPS_STR "_partCallSupportedOps" @@ -916,8 +916,8 @@ typedef int (*partCallSupportedOps_t)(supportedOps_t supportedOps, const char *j int num_ids, int *ids, const char* const* opt_keys, const char* const* opt_vals, int num_opts); -#define MXLIB_PARTCALLACCEPTSUBGRAPH_STR "_partCallAcceptSubgraph" -typedef int (*partCallAcceptSubgraph_t)(acceptSubgraph_t acceptSubgraph, const char *json, +#define MXLIB_PARTCALLREVIEWSUBGRAPH_STR "_partCallReviewSubgraph" +typedef int (*partCallReviewSubgraph_t)(reviewSubgraph_t reviewSubgraph, const char *json, int subgraph_id, int *accept, const char* const* opt_keys, const char* const* opt_vals, int num_opts, char*** attr_keys, char*** attr_vals, int *num_attrs); @@ -1248,12 +1248,12 @@ extern "C" { void #endif _partRegGet(int part_idx, int stg_idx, const char** strategy, supportedOps_t* supportedOps, - acceptSubgraph_t* acceptSubgraph, const char** op_name) { + reviewSubgraph_t* reviewSubgraph, const char** op_name) { CustomPartitioner part = Registry::get()->get(part_idx); *strategy = part.strategies[stg_idx]; *supportedOps = part.supportedOps[stg_idx]; *op_name = part.op_names[stg_idx]; - *acceptSubgraph = part.getAcceptSubgraph(stg_idx); + *reviewSubgraph = part.getReviewSubgraph(stg_idx); } /*! \brief returns status of calling parse attributes function for operator from library */ @@ -1290,7 +1290,7 @@ extern "C" { #else int #endif - _partCallAcceptSubgraph(acceptSubgraph_t acceptSubgraph, const char *json, + _partCallReviewSubgraph(reviewSubgraph_t reviewSubgraph, const char *json, int subgraph_id, int *accept, const char* const* opt_keys, const char* const* opt_vals, int num_opts, char*** attr_keys, char*** attr_vals, int *num_attrs) { @@ -1305,7 +1305,7 @@ extern "C" { // attributes to set on subgraph node std::unordered_map attrs; - MXReturnValue retval = acceptSubgraph(subgraph_json, subgraph_id, &accept_bool, opts, attrs); + MXReturnValue retval = reviewSubgraph(subgraph_json, subgraph_id, &accept_bool, opts, attrs); *accept = accept_bool; if (attrs.size() > 0) { diff --git a/src/c_api/c_api.cc b/src/c_api/c_api.cc index 7b58fea0fa09..31b9d843ef23 100644 --- a/src/c_api/c_api.cc +++ b/src/c_api/c_api.cc @@ -1,4 +1,3 @@ - /* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. 
See the NOTICE file @@ -277,8 +276,8 @@ int MXLoadLib(const char *path) { get_func(lib, const_cast(MXLIB_PARTCALLSUPPORTEDOPS_STR)); - partCallAcceptSubgraph_t callAcceptSubgraph = - get_func(lib, const_cast(MXLIB_PARTCALLACCEPTSUBGRAPH_STR)); + partCallReviewSubgraph_t callReviewSubgraph = + get_func(lib, const_cast(MXLIB_PARTCALLREVIEWSUBGRAPH_STR)); // get number of operators registered in the library opRegSize_t opRegSize = get_func(lib, const_cast(MXLIB_OPREGSIZE_STR)); @@ -844,12 +843,12 @@ int MXLoadLib(const char *path) { const char* strategy; // function pointers holding implementation from custom library supportedOps_t supportedOps_fp = nullptr; - acceptSubgraph_t acceptSubgraph_fp = nullptr; + reviewSubgraph_t reviewSubgraph_fp = nullptr; // name of subgraph op const char* op_name = nullptr; // get custom partitioner strategy from the dynamic library - partRegGet(i, j, &strategy, &supportedOps_fp, &acceptSubgraph_fp, &op_name); + partRegGet(i, j, &strategy, &supportedOps_fp, &reviewSubgraph_fp, &op_name); // validate custom partitioner functions from the dynamic library CHECK(supportedOps_fp != nullptr) << "Error loading '" << name << "' custom partitioner strategy '" << strategy @@ -863,7 +862,7 @@ int MXLoadLib(const char *path) { mxnet::op::SubgraphBackendRegistry::Get()->__REGISTER_CUSTOM_PROPERTY__(name_str, std::make_shared( strategy_str, callSupportedOps, supportedOps_fp, - callAcceptSubgraph, acceptSubgraph_fp, callFree, op_name_str)); + callReviewSubgraph, reviewSubgraph_fp, callFree, op_name_str)); } } API_END(); diff --git a/src/operator/subgraph/partitioner/custom_subgraph_property.h b/src/operator/subgraph/partitioner/custom_subgraph_property.h index 5d0629c25190..410d983fa591 100644 --- a/src/operator/subgraph/partitioner/custom_subgraph_property.h +++ b/src/operator/subgraph/partitioner/custom_subgraph_property.h @@ -72,21 +72,21 @@ class CustomSubgraphProperty: public SubgraphProperty { subgraph_prop("error"), call_supported_ops_(nullptr), supported_ops_(nullptr), - call_accept_subgraph_(nullptr), - accept_subgraph_(nullptr), + call_review_subgraph_(nullptr), + review_subgraph_(nullptr), subgraph_op_name("error") {} CustomSubgraphProperty(std::string subgraph_prop_name, partCallSupportedOps_t call_supported_ops, supportedOps_t supported_ops, - partCallAcceptSubgraph_t call_accept_subgraph, - acceptSubgraph_t accept_subgraph, + partCallReviewSubgraph_t call_review_subgraph, + reviewSubgraph_t review_subgraph, opCallFree_t call_free, std::string op_name) : subgraph_prop(subgraph_prop_name), call_supported_ops_(call_supported_ops), supported_ops_(supported_ops), - call_accept_subgraph_(call_accept_subgraph), - accept_subgraph_(accept_subgraph), + call_review_subgraph_(call_review_subgraph), + review_subgraph_(review_subgraph), call_free_(call_free), subgraph_op_name(op_name) {} @@ -167,7 +167,7 @@ class CustomSubgraphProperty: public SubgraphProperty { int num_attr = 0; char** attr_keys = nullptr; char** attr_vals = nullptr; - if (accept_subgraph_) { + if (review_subgraph_) { nnvm::Graph g; g.outputs = sym.outputs; const auto& idx = g.indexed_graph(); @@ -188,11 +188,11 @@ class CustomSubgraphProperty: public SubgraphProperty { } std::string subgraph_json = nnvm::pass::SaveJSON(g); - CHECK(call_accept_subgraph_(accept_subgraph_, subgraph_json.c_str(), + CHECK(call_review_subgraph_(review_subgraph_, subgraph_json.c_str(), subgraph_id, &accept, opt_keys_.data(), opt_vals_.data(), opt_keys_.size(), &attr_keys, &attr_vals, &num_attr)) - << "Error calling accept_subgraph 
for '" << subgraph_prop << "'"; + << "Error calling review_subgraph for '" << subgraph_prop << "'"; } if (accept) { nnvm::ObjectPtr n = nnvm::Node::Create(); @@ -221,8 +221,8 @@ class CustomSubgraphProperty: public SubgraphProperty { std::string subgraph_prop; partCallSupportedOps_t call_supported_ops_; supportedOps_t supported_ops_; - partCallAcceptSubgraph_t call_accept_subgraph_; - acceptSubgraph_t accept_subgraph_; + partCallReviewSubgraph_t call_review_subgraph_; + reviewSubgraph_t review_subgraph_; opCallFree_t call_free_; std::unordered_set supported_nodes; std::string subgraph_op_name; From 8596239ca46ed9523c39d805bdb6bd95cc3ff778 Mon Sep 17 00:00:00 2001 From: samskalicky Date: Wed, 19 Feb 2020 08:09:13 +0000 Subject: [PATCH 19/19] retrigger CI
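The c_api.cc changes in patches 10, 13 and 14 above register a single forward lambda for both contexts and pick the CPU or GPU callback at run time from the device mask. The snippet below is a simplified, standalone model of that dispatch pattern; the `DeviceMask` enum, `FCompute` typedef and `dispatchForward` name are stand-ins invented for this sketch, not MXNet types.

```c++
#include <stdexcept>
#include <string>
#include <unordered_map>

// Stand-ins: MXNet uses Context::kCPU / Context::kGPU and fcomp_t from lib_api.h.
enum DeviceMask { kCPU = 1, kGPU = 2 };
using FCompute = void (*)();

void dispatchForward(DeviceMask dev,
                     const std::unordered_map<std::string, FCompute>& forward_ctx_map) {
  // Choose the registered callback based on the device the operator runs on,
  // and fail loudly (like CHECK_GT in c_api.cc) if none was registered for it.
  const std::string key = (dev == kCPU) ? "cpu" : "gpu";
  auto it = forward_ctx_map.find(key);
  if (it == forward_ctx_map.end())
    throw std::runtime_error("no forward function registered for " + key);
  it->second();  // in MXNet this call is CustomFComputeDispatcher(..., fcomp, ...)
}
```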