From 4d614bd015205ff50cf7e2ccb7b81acd46431b84 Mon Sep 17 00:00:00 2001 From: acphile Date: Wed, 24 Jun 2020 10:25:44 +0800 Subject: [PATCH] Update tutorials (#18609) Update docs according to new Block APIs (#18413) --- docs/python_docs/python/api/gluon/index.rst | 7 +- .../python/tutorials/extend/custom_layer.md | 28 ++-- .../python/tutorials/extend/customop.md | 2 +- .../crash-course/6-use_gpus.md | 4 +- .../gluon_from_experiment_to_deployment.md | 3 +- .../logistic_regression_explained.md | 9 +- .../getting-started/to-mxnet/pytorch.md | 9 +- .../packages/gluon/blocks/custom-layer.md | 4 +- .../gluon/blocks/custom_layer_beginners.md | 28 ++-- .../tutorials/packages/gluon/blocks/naming.md | 130 +++++------------- .../tutorials/packages/gluon/blocks/nn.md | 3 +- .../packages/gluon/blocks/save_load_params.md | 33 ++--- .../tutorials/packages/gluon/data/datasets.md | 7 +- .../packages/gluon/image/info_gan.md | 82 ++++++----- .../tutorials/packages/gluon/image/mnist.md | 22 ++- .../packages/gluon/image/pretrained_models.md | 3 +- .../packages/gluon/loss/custom-loss.md | 16 +-- .../packages/gluon/text/transformer.rst | 4 +- .../learning_rates/learning_rate_schedules.md | 31 ++--- .../packages/ndarray/sparse/train_gluon.md | 30 ++-- .../packages/onnx/fine_tuning_gluon.md | 5 +- .../tutorials/performance/backend/amp.md | 4 +- .../tutorials/performance/backend/profiler.md | 17 ++- 23 files changed, 194 insertions(+), 287 deletions(-) diff --git a/docs/python_docs/python/api/gluon/index.rst b/docs/python_docs/python/api/gluon/index.rst index cf76ef42f5c2..abb9072bfbd9 100644 --- a/docs/python_docs/python/api/gluon/index.rst +++ b/docs/python_docs/python/api/gluon/index.rst @@ -33,10 +33,9 @@ one input layer, one hidden layer, and one output layer. # When instantiated, Sequential stores a chain of neural network layers. # Once presented with data, Sequential executes each layer in turn, using # the output of one layer as the input for the next - with net.name_scope(): - net.add(gluon.nn.Dense(256, activation="relu")) # 1st layer (256 nodes) - net.add(gluon.nn.Dense(256, activation="relu")) # 2nd hidden layer - net.add(gluon.nn.Dense(num_outputs)) + net.add(gluon.nn.Dense(256, activation="relu")) # 1st layer (256 nodes) + net.add(gluon.nn.Dense(256, activation="relu")) # 2nd hidden layer + net.add(gluon.nn.Dense(num_outputs)) .. 
automodule:: mxnet.gluon diff --git a/docs/python_docs/python/tutorials/extend/custom_layer.md b/docs/python_docs/python/tutorials/extend/custom_layer.md index 72700c0b9738..8a6a2cb6c21a 100644 --- a/docs/python_docs/python/tutorials/extend/custom_layer.md +++ b/docs/python_docs/python/tutorials/extend/custom_layer.md @@ -111,10 +111,9 @@ Below is an example of how to create a simple neural network with a custom layer ```python net = gluon.nn.HybridSequential() # Define a Neural Network as a sequence of hybrid blocks -with net.name_scope(): # Used to disambiguate saving and loading net parameters - net.add(Dense(5)) # Add Dense layer with 5 neurons - net.add(NormalizationHybridLayer()) # Add our custom layer - net.add(Dense(1)) # Add Dense layer with 1 neurons +net.add(Dense(5)) # Add Dense layer with 5 neurons +net.add(NormalizationHybridLayer()) # Add our custom layer +net.add(Dense(1)) # Add Dense layer with 1 neurons net.initialize(mx.init.Xavier(magnitude=2.24)) # Initialize parameters of all layers @@ -148,12 +147,11 @@ class NormalizationHybridLayer(gluon.HybridBlock): def __init__(self, hidden_units, scales): super(NormalizationHybridLayer, self).__init__() - with self.name_scope(): - self.weights = self.params.get('weights', - shape=(hidden_units, 0), - allow_deferred_init=True) + self.weights = gluon.Parameter('weights', + shape=(hidden_units, 0), + allow_deferred_init=True) - self.scales = self.params.get('scales', + self.scales = gluon.Parameter('scales', shape=scales.shape, init=mx.init.Constant(scales.asnumpy().tolist()), # Convert to regular list to make this object serializable differentiable=False) @@ -170,14 +168,13 @@ In the example above 2 set of parameters are defined: 1. Parameter `scale` is a constant that doesn't change. Its shape is defined during construction. Notice a few aspects of this code: -* `name_scope()` method is used to add a prefix to parameter names during saving and loading * Shape is not provided when creating `weights`. Instead it is going to be infered from the shape of the input * `Scales` parameter is initialized and marked as `differentiable=False`. * `F` backend is used for all calculations * The calculation of dot product is done using `F.FullyConnected()` method instead of `F.dot()` method. The one was chosen over another because the former supports automatic infering shapes of inputs while the latter doesn't. This is extremely important to know, if one doesn't want to hard code all the shapes. The best way to learn what operators supports automatic inference of input shapes at the moment is browsing C++ implementation of operators to see if one uses a method `SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param.num_hidden, num_input));` * `hybrid_forward()` method signature has changed. It accepts two new arguments: `weights` and `scales`. -The last peculiarity is due to support of imperative and symbolic programming by `HybridBlock`. During training phase, parameters are passed to the layer by Apache MxNet framework as additional arguments to the method, because they might need to be converted to a `Symbol` depending on if the layer was hybridized. One shouldn't use `self.weights` and `self.scales` or `self.params.get` in `hybrid_forward` except to get shapes of parameters. +The last peculiarity is due to support of imperative and symbolic programming by `HybridBlock`. 
During training phase, parameters are passed to the layer by Apache MxNet framework as additional arguments to the method, because they might need to be converted to a `Symbol` depending on if the layer was hybridized. One shouldn't use `self.weights` and `self.scales` in `hybrid_forward` except to get shapes of parameters. Running forward pass on this network is very similar to the previous example, so instead of just doing one forward pass, let's run whole training for a few epochs to show that `scales` parameter doesn't change during the training while `weights` parameter is changing. @@ -194,11 +191,10 @@ def print_params(title, net): print('{} = {}\n'.format(key, value.data())) net = gluon.nn.HybridSequential() # Define a Neural Network as a sequence of hybrid blocks -with net.name_scope(): # Used to disambiguate saving and loading net parameters - net.add(Dense(5)) # Add Dense layer with 5 neurons - net.add(NormalizationHybridLayer(hidden_units=5, - scales = nd.array([2]))) # Add our custom layer - net.add(Dense(1)) # Add Dense layer with 1 neurons +net.add(Dense(5)) # Add Dense layer with 5 neurons +net.add(NormalizationHybridLayer(hidden_units=5, + scales = nd.array([2]))) # Add our custom layer +net.add(Dense(1)) # Add Dense layer with 1 neurons net.initialize(mx.init.Xavier(magnitude=2.24)) # Initialize parameters of all layers diff --git a/docs/python_docs/python/tutorials/extend/customop.md b/docs/python_docs/python/tutorials/extend/customop.md index cc9cff598ee9..f1ee1d2ae601 100644 --- a/docs/python_docs/python/tutorials/extend/customop.md +++ b/docs/python_docs/python/tutorials/extend/customop.md @@ -197,7 +197,7 @@ class DenseBlock(mx.gluon.Block): def __init__(self, in_channels, channels, bias, **kwargs): super(DenseBlock, self).__init__(**kwargs) self._bias = bias - self.weight = self.params.get('weight', shape=(channels, in_channels)) + self.weight = gluon.Parameter('weight', shape=(channels, in_channels)) def forward(self, x): ctx = x.context diff --git a/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md b/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md index a0788ba7df2d..fc457ea7dc33 100644 --- a/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md +++ b/docs/python_docs/python/tutorials/getting-started/crash-course/6-use_gpus.md @@ -82,7 +82,7 @@ net.add(nn.Conv2D(channels=6, kernel_size=5, activation='relu'), nn.Dense(10)) ``` -And then load the saved parameters into GPU 0 directly, or use `net.collect_params().reset_ctx` to change the device. +And then load the saved parameters into GPU 0 directly, or use `net.reset_ctx` to change the device. ```{.python .input n=20} net.load_parameters('net.params', ctx=gpu(0)) @@ -120,7 +120,7 @@ The training loop is quite similar to what we introduced before. The major diffe # Diff 1: Use two GPUs for training. 
devices = [gpu(0), gpu(1)] # Diff 2: reinitialize the parameters and place them on multiple GPUs -net.collect_params().initialize(force_reinit=True, ctx=devices) +net.initialize(force_reinit=True, ctx=devices) # Loss and trainer are the same as before softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss() trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1}) diff --git a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md index 7fb4d48157f6..bd9dbacf3e97 100644 --- a/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md +++ b/docs/python_docs/python/tutorials/getting-started/gluon_from_experiment_to_deployment.md @@ -170,8 +170,7 @@ Before we go to training, one unique Gluon feature you should be aware of is hyb finetune_net = resnet50_v2(pretrained=True, ctx=ctx) # change last softmax layer since number of classes are different -with finetune_net.name_scope(): - finetune_net.output = nn.Dense(classes) +finetune_net.output = nn.Dense(classes) finetune_net.output.initialize(init.Xavier(), ctx=ctx) # hybridize for better performance finetune_net.hybridize() diff --git a/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md b/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md index 61ee25784817..277aa5d2d82c 100644 --- a/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md +++ b/docs/python_docs/python/tutorials/getting-started/logistic_regression_explained.md @@ -80,11 +80,10 @@ Below, we define a model which has an input layer of 10 neurons, a couple of inn ```python net = nn.HybridSequential() -with net.name_scope(): - net.add(nn.Dense(units=10, activation='relu')) # input layer - net.add(nn.Dense(units=10, activation='relu')) # inner layer 1 - net.add(nn.Dense(units=10, activation='relu')) # inner layer 2 - net.add(nn.Dense(units=1)) # output layer: notice, it must have only 1 neuron +net.add(nn.Dense(units=10, activation='relu')) # input layer +net.add(nn.Dense(units=10, activation='relu')) # inner layer 1 +net.add(nn.Dense(units=10, activation='relu')) # inner layer 2 +net.add(nn.Dense(units=1)) # output layer: notice, it must have only 1 neuron net.initialize(mx.init.Xavier()) ``` diff --git a/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md b/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md index ec4bdfcdc77e..aada149a6632 100644 --- a/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md +++ b/docs/python_docs/python/tutorials/getting-started/to-mxnet/pytorch.md @@ -342,13 +342,12 @@ Apache MXNet uses lazy evaluation to achieve superior performance. The Python th ## PyTorch module and Gluon blocks -### For new block definition, gluon needs name_scope +### For new block definition, gluon is similar to PyTorch -`name_scope` coerces Gluon to give each parameter an appropriate name, indicating which model it belongs to. | Function | PyTorch | MXNet Gluon | |------------------------|-----------------------------------|----------------------------------------------------------------------------| -| New block definition | `class Net(torch.nn.Module):`
    `def __init__(self, D_in, D_out):`
        `super(Net, self).__init__()`
        `self.linear = torch.nn.Linear(D_in, D_out)`
    `def forward(self, x):`
        `return self.linear(x)` | `class Net(mx.gluon.Block):`
    `def __init__(self, D_in, D_out):`
        `super(Net, self).__init__()`
        `with self.name_scope():`
            `self.dense=mx.gluon.nn.Dense(D_out, in_units=D_in)`
    `def forward(self, x):`
        `return self.dense(x)` | +| New block definition | `class Net(torch.nn.Module):`
    `def __init__(self, D_in, D_out):`
        `super(Net, self).__init__()`
        `self.linear = torch.nn.Linear(D_in, D_out)`
    `def forward(self, x):`
        `return self.linear(x)` | `class Net(mx.gluon.Block):`
    `def __init__(self, D_in, D_out):`
        `super(Net, self).__init__()`
        `self.dense=mx.gluon.nn.Dense(D_out, in_units=D_in)`
    `def forward(self, x):`
        `return self.dense(x)` | ### Parameter and Initializer @@ -374,7 +373,7 @@ Instead of explicitly declaring the number of inputs to a layer, we can simply s | Function | PyTorch | MXNet Gluon | |------------------------|-----------------------------------|----------------------------------------------------------------------------| -| partial-shape
hybridized | Not Available | `net = mx.gluon.nn.HybridSequential()`
`with net.name_scope():`
    `net.add(mx.gluon.nn.Dense(10))`
`net.hybridize()` | +| partial-shape
hybridized | Not Available | `net = mx.gluon.nn.HybridSequential()`
`net.add(mx.gluon.nn.Dense(10))`
`net.hybridize()` | ### SymbolBlock @@ -382,7 +381,7 @@ SymbolBlock can construct block from symbol. This is useful for using pre-traine | Function | PyTorch | MXNet Gluon | |------------------------|-----------------------------------|----------------------------------------------------------------------------| -| SymbolBlock | Not Available | `alexnet = mx.gluon.model_zoo.vision.alexnet(pretrained=True, prefix='model_')`
`out = alexnet(inputs)`
`internals = out.get_internals()`
`outputs = [internals['model_dense0_relu_fwd_output']]`
`feat_model = gluon.SymbolBlock(outputs, inputs, params=alexnet.collect_params())` | +| SymbolBlock | Not Available | `alexnet = mx.gluon.model_zoo.vision.alexnet(pretrained=True)`
`out = alexnet(inputs)`
`internals = out.get_internals()`
`outputs = [internals['model_dense0_relu_fwd_output']]`
`feat_model = gluon.SymbolBlock(outputs, inputs, params=alexnet.collect_params())` | ## PyTorch optimizer vs Gluon Trainer ### For Gluon API calling zero_grad is not necessary most of the time diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md index 295bd61ade71..48b4491a1532 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom-layer.md @@ -91,8 +91,8 @@ class MyDense(nn.Block): # in_units: the number of inputs in this layer super(MyDense, self).__init__(**kwargs) - self.weight = self.params.get('weight', shape=(in_units, units)) - self.bias = self.params.get('bias', shape=(units,)) + self.weight = gluon.Parameter('weight', shape=(in_units, units)) + self.bias = gluon.Parameter('bias', shape=(units,)) def forward(self, x): linear = nd.dot(x, self.weight.data()) + self.bias.data() diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md index 933a70bbdf2c..99fed59678ca 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/custom_layer_beginners.md @@ -102,10 +102,9 @@ Below is an example of how to create a simple neural network with a custom layer ```python net = gluon.nn.HybridSequential() # Define a Neural Network as a sequence of hybrid blocks -with net.name_scope(): # Used to disambiguate saving and loading net parameters - net.add(Dense(5)) # Add Dense layer with 5 neurons - net.add(NormalizationHybridLayer()) # Add our custom layer - net.add(Dense(1)) # Add Dense layer with 1 neurons +net.add(Dense(5)) # Add Dense layer with 5 neurons +net.add(NormalizationHybridLayer()) # Add our custom layer +net.add(Dense(1)) # Add Dense layer with 1 neurons net.initialize(mx.init.Xavier(magnitude=2.24)) # Initialize parameters of all layers @@ -134,12 +133,11 @@ class NormalizationHybridLayer(gluon.HybridBlock): def __init__(self, hidden_units, scales): super(NormalizationHybridLayer, self).__init__() - with self.name_scope(): - self.weights = self.params.get('weights', - shape=(hidden_units, 0), - allow_deferred_init=True) + self.weights = gluon.Parameter('weights', + shape=(hidden_units, 0), + allow_deferred_init=True) - self.scales = self.params.get('scales', + self.scales = gluon.Parameter('scales', shape=scales.shape, init=mx.init.Constant(scales.asnumpy()), differentiable=False) @@ -157,14 +155,13 @@ In the example above 2 set of parameters are defined: Notice a few aspects of this code: -+ `name_scope()` method is used to add a prefix to parameter names during saving and loading + Shape is not provided when creating `weights`. Instead it is going to be infered from the shape of the input + `Scales` parameter is initialized and marked as `differentiable=False`. + `F` backend is used for all calculations + The calculation of dot product is done using `F.FullyConnected()` method instead of `F.dot()` method. The one was chosen over another because the former supports automatic infering shapes of inputs while the latter doesn’t. This is extremely important to know, if one doesn’t want to hard code all the shapes. 
The best way to learn what operators supports automatic inference of input shapes at the moment is browsing C++ implementation of operators to see if one uses a method `SHAPE_ASSIGN_CHECK(*in_shape, fullc::kWeight, Shape2(param.num_hidden, num_input));` + `hybrid_forward()` method signature has changed. It accepts two new arguments: `weights` and `scales`. -The last peculiarity is due to support of imperative and symbolic programming by `HybridBlock`. During training phase, parameters are passed to the layer by Apache MxNet framework as additional arguments to the method, because they might need to be converted to a `Symbol` depending on if the layer was hybridized. One shouldn’t use `self.weights` and `self.scales` or `self.params.get` in `hybrid_forward` except to get shapes of parameters. +The last peculiarity is due to support of imperative and symbolic programming by `HybridBlock`. During training phase, parameters are passed to the layer by Apache MxNet framework as additional arguments to the method, because they might need to be converted to a `Symbol` depending on if the layer was hybridized. One shouldn’t use `self.weights` and `self.scales` in `hybrid_forward` except to get shapes of parameters. Running forward pass on this network is very similar to the previous example, so instead of just doing one forward pass, let’s run whole training for a few epochs to show that `scales` parameter doesn’t change during the training while `weights` parameter is changing. @@ -180,11 +177,10 @@ def print_params(title, net): print('{} = {}\n'.format(key, value.data())) net = gluon.nn.HybridSequential() # Define a Neural Network as a sequence of hybrid blocks -with net.name_scope(): # Used to disambiguate saving and loading net parameters - net.add(Dense(5)) # Add Dense layer with 5 neurons - net.add(NormalizationHybridLayer(hidden_units=5, - scales = nd.array([2]))) # Add our custom layer - net.add(Dense(1)) # Add Dense layer with 1 neurons +net.add(Dense(5)) # Add Dense layer with 5 neurons +net.add(NormalizationHybridLayer(hidden_units=5, + scales = nd.array([2]))) # Add our custom layer +net.add(Dense(1)) # Add Dense layer with 1 neurons net.initialize(mx.init.Xavier(magnitude=2.24)) # Initialize parameters of all layers diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md index 24bd7d9b0f2c..6f98a2f6b2ce 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/naming.md @@ -17,7 +17,7 @@ # Parameter and Block Naming -In gluon, each Parameter or Block has a name (and prefix). Parameter names are specified by users and Block names can be either specified by users or automatically created. +In gluon, each Parameter or Block has a name. Parameter names and Block names can be automatically created. In this tutorial we talk about the best practices on naming. 
First, let's import MXNet and Gluon: @@ -30,144 +30,79 @@ from mxnet import gluon ## Naming Blocks -When creating a block, you can assign a prefix to it: +When creating a block, you can simply do as follows: ```python -mydense = gluon.nn.Dense(100, prefix='mydense_') -print(mydense.prefix) +mydense = gluon.nn.Dense(100) +print(mydense.name) ``` -When no prefix is given, Gluon will automatically generate one: +When you create more Blocks of the same kind, they will be named with incrementing suffixes to avoid collision: ```python -dense0 = gluon.nn.Dense(100) -print(dense0.prefix) +dense1 = gluon.nn.Dense(100) +print(dense1.name) ``` -When you create more Blocks of the same kind, they will be named with incrementing suffixes to avoid collision: +## Naming Parameters + +Parameters will be named automatically by a unique name in the format of `param_{uuid4}_{name}`: ```python -dense1 = gluon.nn.Dense(100) -print(dense1.prefix) +param = gluon.Parameter(name = 'bias') +print(param.name) ``` -## Naming Parameters +`param.name` is used as the name of a parameter's symbol representation. And it can not be changed once the parameter is created. -Parameters within a Block will be named by prepending the prefix of the Block to the name of the Parameter: +When getting parameters within a Block, you should use the structure based name as the key: ```python print(dense0.collect_params()) ``` -## Name scopes - -To manage the names of nested Blocks, each Block has a `name_scope` attached to it. All Blocks created within a name scope will have its parent Block's prefix prepended to its name. +## Nested Blocks -Let's demonstrate this by first defining a simple neural net: +In MXNet 2, we don't have to define children blocks within a `name_scope` any more. Let's demonstrate this by defining and initiating a simple neural net: ```python -class Model(gluon.Block): - def __init__(self, **kwargs): - super(Model, self).__init__(**kwargs) - with self.name_scope(): - self.dense0 = gluon.nn.Dense(20) - self.dense1 = gluon.nn.Dense(20) - self.mydense = gluon.nn.Dense(20, prefix='mydense_') +class Model(gluon.HybridBlock): + def __init__(self): + super(Model, self).__init__() + self.dense0 = gluon.nn.Dense(20) + self.dense1 = gluon.nn.Dense(20) + self.mydense = gluon.nn.Dense(20) def forward(self, x): x = mx.nd.relu(self.dense0(x)) x = mx.nd.relu(self.dense1(x)) return mx.nd.relu(self.mydense(x)) -``` - -Now let's instantiate our neural net. - -- Note that `model0.dense0` is named as `model0_dense0_` instead of `dense0_`. -- Also note that although we specified `mydense_` as prefix for `model.mydense`, its parent's prefix is automatically prepended to generate the prefix `model0_mydense_`. - - -```python model0 = Model() model0.initialize() +model0.hybridize() model0(mx.nd.zeros((1, 20))) -print(model0.prefix) -print(model0.dense0.prefix) -print(model0.dense1.prefix) -print(model0.mydense.prefix) ``` - -If we instantiate `Model` again, it will be given a different name like shown before for `Dense`. - -- Note that `model1.dense0` is still named as `dense0_` instead of `dense2_`, following dense layers in previously created `model0`. This is because each instance of model's name scope is independent of each other. - - -```python -model1 = Model() -print(model1.prefix) -print(model1.dense0.prefix) -print(model1.dense1.prefix) -print(model1.mydense.prefix) -``` - -**It is recommended that you manually specify a prefix for the top level Block, i.e. 
`model = Model(prefix='mymodel_')`, to avoid potential confusions in naming.** - -The same principle also applies to container blocks like Sequential. `name_scope` can be used inside `__init__` as well as out side of `__init__`: +The same principle also applies to container blocks like Sequential. We can simply do as follows: ```python net = gluon.nn.Sequential() -with net.name_scope(): - net.add(gluon.nn.Dense(20)) - net.add(gluon.nn.Dense(20)) -print(net.prefix) -print(net[0].prefix) -print(net[1].prefix) -``` - -`gluon.model_zoo` also behaves similarly: - - -```python -net = gluon.nn.Sequential() -with net.name_scope(): - net.add(gluon.model_zoo.vision.alexnet(pretrained=True)) - net.add(gluon.model_zoo.vision.alexnet(pretrained=True)) -print(net.prefix, net[0].prefix, net[1].prefix) +net.add(gluon.nn.Dense(20)) +net.add(gluon.nn.Dense(20)) ``` ## Saving and loading -Because model0 and model1 have different prefixes, their parameters also have different names: - - -```python -print(model0.collect_params(), '\n') -print(model1.collect_params()) -``` - -As a result, if you try to save parameters from model0 and load it with model1, you'll get an error due to unmatching names: - - -```python -model0.collect_params().save('model.params') -try: - model1.collect_params().load('model.params', mx.cpu()) -except Exception as e: - print(e) -``` - - - -To solve this problem, we use `save_parameters`/`load_parameters` instead of `collect_params` and `save`/`load`. `save_parameters` uses model structure, instead of parameter name, to match parameters. +For `HybridBlock`, we use `save_parameters`/`load_parameters`, which uses model structure, instead of parameter name, to match parameters. ```python @@ -176,7 +111,12 @@ model1.load_parameters('model.params') print(mx.nd.load('model.params').keys()) ``` +For `SymbolBlock.imports`, we use `export`, which uses parameter name `param.name`, to save parameters. +```python +model0.export('model0') +model2 = gluon.SymbolBlock.imports('model0-symbol.json', ['data'], 'model0-0000.params') +``` ## Replacing Blocks from networks and fine-tuning @@ -193,7 +133,6 @@ To see how to do this, we first load a pretrained AlexNet. ```python alexnet = gluon.model_zoo.vision.alexnet(pretrained=True) print(alexnet.output) -print(alexnet.output.prefix) ``` @@ -201,9 +140,6 @@ To change the output to 100 dimension, we replace it with a new block. ```python -with alexnet.name_scope(): - alexnet.output = gluon.nn.Dense(100) +alexnet.output = gluon.nn.Dense(100) alexnet.output.initialize() -print(alexnet.output) -print(alexnet.output.prefix) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/nn.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/nn.md index 337ae8fe71bf..323aa136f220 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/nn.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/nn.md @@ -37,6 +37,7 @@ perceptron. A common strategy would be to construct a two-layer network as follows: ```{.python .input n=1} +import mxnet as mx from mxnet import nd from mxnet.gluon import nn @@ -226,7 +227,7 @@ class FancyMLP(nn.Block): # Random weight parameters created with the get_constant are not # iterated during training (i.e. constant parameters). 
- self.rand_weight = self.params.get_constant( + self.rand_weight = mx.gluon.Constant( 'rand_weight', nd.random.uniform(shape=(20, 20))) self.dense = nn.Dense(20, activation='relu') diff --git a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md index 05153034778f..38f3b5dae159 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md +++ b/docs/python_docs/python/tutorials/packages/gluon/blocks/save_load_params.md @@ -65,26 +65,25 @@ train_data = gluon.data.DataLoader(gluon.data.vision.MNIST(train=True).transform # Build a simple convolutional network def build_lenet(net): - with net.name_scope(): - # First convolution - net.add(gluon.nn.Conv2D(channels=20, kernel_size=5, activation='relu')) - net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2)) - # Second convolution - net.add(gluon.nn.Conv2D(channels=50, kernel_size=5, activation='relu')) - net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2)) - # Flatten the output before the fully connected layers - net.add(gluon.nn.Flatten()) - # First fully connected layers with 512 neurons - net.add(gluon.nn.Dense(512, activation="relu")) - # Second fully connected layer with as many neurons as the number of classes - net.add(gluon.nn.Dense(num_outputs)) - - return net + # First convolution + net.add(gluon.nn.Conv2D(channels=20, kernel_size=5, activation='relu')) + net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2)) + # Second convolution + net.add(gluon.nn.Conv2D(channels=50, kernel_size=5, activation='relu')) + net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2)) + # Flatten the output before the fully connected layers + net.add(gluon.nn.Flatten()) + # First fully connected layers with 512 neurons + net.add(gluon.nn.Dense(512, activation="relu")) + # Second fully connected layer with as many neurons as the number of classes + net.add(gluon.nn.Dense(num_outputs)) + + return net # Train a given model using MNIST data def train_model(model): # Initialize the parameters with Xavier initializer - model.collect_params().initialize(mx.init.Xavier(), ctx=ctx) + model.initialize(mx.init.Xavier(), ctx=ctx) # Use cross entropy loss softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss() # Use Adam optimizer @@ -152,8 +151,6 @@ net.save_parameters(file_name) We have successfully saved the parameters of the model into a file. -Note: `Block.collect_params().save()` is not a recommended way to save parameters of a Gluon network if you plan to load the parameters back into a Gluon network using `Block.load_parameters()`. - ## Loading model parameters from file Let's now create a network with the parameters we saved into the file. We build the network again using the helper first and then load the weights from the file we saved using the `load_parameters` function. 
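A minimal sketch of the resulting save/load round trip, assuming the `build_lenet` helper and `ctx` defined in this tutorial (the dummy batch and the `net.params` file name are illustrative only): with `name_scope` gone, `save_parameters`/`load_parameters` match parameters by block structure rather than by prefix-based names.

```python
import mxnet as mx
from mxnet import gluon

# Build and initialize the network; running one dummy MNIST-shaped batch
# resolves the deferred shapes so the parameters can be saved.
net = build_lenet(gluon.nn.Sequential())
net.initialize(mx.init.Xavier(), ctx=ctx)
net(mx.nd.zeros((1, 1, 28, 28), ctx=ctx))
net.save_parameters('net.params')

# Re-create the same architecture and load the weights back by structure.
new_net = build_lenet(gluon.nn.Sequential())
new_net.load_parameters('net.params', ctx=ctx)
```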
diff --git a/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md b/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md index 5e7ce13cb16a..bd291dd92253 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md +++ b/docs/python_docs/python/tutorials/packages/gluon/data/datasets.md @@ -150,10 +150,9 @@ from mxnet import gluon, autograd, ndarray def construct_net(): net = gluon.nn.HybridSequential() - with net.name_scope(): - net.add(gluon.nn.Dense(128, activation="relu")) - net.add(gluon.nn.Dense(64, activation="relu")) - net.add(gluon.nn.Dense(10)) + net.add(gluon.nn.Dense(128, activation="relu")) + net.add(gluon.nn.Dense(64, activation="relu")) + net.add(gluon.nn.Dense(10)) return net # construct and initialize network. diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md index 646cf728e5ef..463ee341e7c4 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/info_gan.md @@ -120,25 +120,24 @@ Define the Generator model. Architecture is taken from the DCGAN implementation class Generator(gluon.HybridBlock): def __init__(self, **kwargs): super(Generator, self).__init__(**kwargs) - with self.name_scope(): - self.prev = nn.HybridSequential() - self.prev.add(nn.Dense(1024, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) - self.G = nn.HybridSequential() - - self.G.add(nn.Conv2DTranspose(64 * 8, 4, 1, 0, use_bias=False)) - self.G.add(nn.BatchNorm()) - self.G.add(nn.Activation('relu')) - self.G.add(nn.Conv2DTranspose(64 * 4, 4, 2, 1, use_bias=False)) - self.G.add(nn.BatchNorm()) - self.G.add(nn.Activation('relu')) - self.G.add(nn.Conv2DTranspose(64 * 2, 4, 2, 1, use_bias=False)) - self.G.add(nn.BatchNorm()) - self.G.add(nn.Activation('relu')) - self.G.add(nn.Conv2DTranspose(64, 4, 2, 1, use_bias=False)) - self.G.add(nn.BatchNorm()) - self.G.add(nn.Activation('relu')) - self.G.add(nn.Conv2DTranspose(3, 4, 2, 1, use_bias=False)) - self.G.add(nn.Activation('tanh')) + self.prev = nn.HybridSequential() + self.prev.add(nn.Dense(1024, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) + self.G = nn.HybridSequential() + + self.G.add(nn.Conv2DTranspose(64 * 8, 4, 1, 0, use_bias=False)) + self.G.add(nn.BatchNorm()) + self.G.add(nn.Activation('relu')) + self.G.add(nn.Conv2DTranspose(64 * 4, 4, 2, 1, use_bias=False)) + self.G.add(nn.BatchNorm()) + self.G.add(nn.Activation('relu')) + self.G.add(nn.Conv2DTranspose(64 * 2, 4, 2, 1, use_bias=False)) + self.G.add(nn.BatchNorm()) + self.G.add(nn.Activation('relu')) + self.G.add(nn.Conv2DTranspose(64, 4, 2, 1, use_bias=False)) + self.G.add(nn.BatchNorm()) + self.G.add(nn.Activation('relu')) + self.G.add(nn.Conv2DTranspose(3, 4, 2, 1, use_bias=False)) + self.G.add(nn.Activation('tanh')) def hybrid_forward(self, F, x): x = self.prev(x) @@ -154,29 +153,28 @@ Define the Discriminator and Q model. 
The Q model shares many layers with the Di class Discriminator(gluon.HybridBlock): def __init__(self, **kwargs): super(Discriminator, self).__init__(**kwargs) - with self.name_scope(): - self.D = nn.HybridSequential() - self.D.add(nn.Conv2D(64, 4, 2, 1, use_bias=False)) - self.D.add(nn.LeakyReLU(0.2)) - self.D.add(nn.Conv2D(64 * 2, 4, 2, 1, use_bias=False)) - self.D.add(nn.BatchNorm()) - self.D.add(nn.LeakyReLU(0.2)) - self.D.add(nn.Conv2D(64 * 4, 4, 2, 1, use_bias=False)) - self.D.add(nn.BatchNorm()) - self.D.add(nn.LeakyReLU(0.2)) - self.D.add(nn.Conv2D(64 * 8, 4, 2, 1, use_bias=False)) - self.D.add(nn.BatchNorm()) - self.D.add(nn.LeakyReLU(0.2)) - - self.D.add(nn.Dense(1024, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) - - self.prob = nn.Dense(1) - self.feat = nn.HybridSequential() - self.feat.add(nn.Dense(128, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) - self.category_prob = nn.Dense(n_categories) - self.continuous_mean = nn.Dense(n_continuous) - self.Q = nn.HybridSequential() - self.Q.add(self.feat, self.category_prob, self.continuous_mean) + self.D = nn.HybridSequential() + self.D.add(nn.Conv2D(64, 4, 2, 1, use_bias=False)) + self.D.add(nn.LeakyReLU(0.2)) + self.D.add(nn.Conv2D(64 * 2, 4, 2, 1, use_bias=False)) + self.D.add(nn.BatchNorm()) + self.D.add(nn.LeakyReLU(0.2)) + self.D.add(nn.Conv2D(64 * 4, 4, 2, 1, use_bias=False)) + self.D.add(nn.BatchNorm()) + self.D.add(nn.LeakyReLU(0.2)) + self.D.add(nn.Conv2D(64 * 8, 4, 2, 1, use_bias=False)) + self.D.add(nn.BatchNorm()) + self.D.add(nn.LeakyReLU(0.2)) + + self.D.add(nn.Dense(1024, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) + + self.prob = nn.Dense(1) + self.feat = nn.HybridSequential() + self.feat.add(nn.Dense(128, use_bias=False), nn.BatchNorm(), nn.Activation(activation='relu')) + self.category_prob = nn.Dense(n_categories) + self.continuous_mean = nn.Dense(n_continuous) + self.Q = nn.HybridSequential() + self.Q.add(self.feat, self.category_prob, self.continuous_mean) def hybrid_forward(self, F, x): x = self.D(x) diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md index 39726a3a511c..f18ec1a2357f 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/mnist.md @@ -99,10 +99,9 @@ To do this, we will use [Sequential layer](https://mxnet.io/api/python/docs/api/ ```python # define network net = nn.Sequential() -with net.name_scope(): - net.add(nn.Dense(128, activation='relu')) - net.add(nn.Dense(64, activation='relu')) - net.add(nn.Dense(10)) +net.add(nn.Dense(128, activation='relu')) +net.add(nn.Dense(64, activation='relu')) +net.add(nn.Dense(10)) ``` #### Initialize parameters and optimizer @@ -225,15 +224,12 @@ import mxnet.ndarray as F class Net(gluon.Block): def __init__(self, **kwargs): super(Net, self).__init__(**kwargs) - with self.name_scope(): - # layers created in name_scope will inherit name space - # from parent layer. 
- self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) - self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) - self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) - self.fc1 = nn.Dense(500) - self.fc2 = nn.Dense(10) + self.conv1 = nn.Conv2D(20, kernel_size=(5,5)) + self.pool1 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.conv2 = nn.Conv2D(50, kernel_size=(5,5)) + self.pool2 = nn.MaxPool2D(pool_size=(2,2), strides = (2,2)) + self.fc1 = nn.Dense(500) + self.fc2 = nn.Dense(10) def forward(self, x): x = self.pool1(F.tanh(self.conv1(x))) diff --git a/docs/python_docs/python/tutorials/packages/gluon/image/pretrained_models.md b/docs/python_docs/python/tutorials/packages/gluon/image/pretrained_models.md index fca73ad46aff..fea7fda6b38f 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/image/pretrained_models.md +++ b/docs/python_docs/python/tutorials/packages/gluon/image/pretrained_models.md @@ -224,8 +224,7 @@ You can replace the output layer of your pre-trained model to fit the right numb ```python NUM_CLASSES = 10 -with resnet18.name_scope(): - resnet18.output = gluon.nn.Dense(NUM_CLASSES) +resnet18.output = gluon.nn.Dense(NUM_CLASSES) ``` diff --git a/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md b/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md index 9210e8d0fb00..1cae0c91b837 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md +++ b/docs/python_docs/python/tutorials/packages/gluon/loss/custom-loss.md @@ -75,15 +75,13 @@ Our network consists of 2 convolutional and max pooling layers that downsample t class Siamese(gluon.HybridBlock): def __init__(self, **kwargs): super(Siamese, self).__init__(**kwargs) - with self.name_scope(): - self.cnn = gluon.nn.HybridSequential() - with self.cnn.name_scope(): - self.cnn.add(gluon.nn.Conv2D(64, 5, activation='relu')) - self.cnn.add(gluon.nn.MaxPool2D(2, 2)) - self.cnn.add(gluon.nn.Conv2D(64, 5, activation='relu')) - self.cnn.add(gluon.nn.MaxPool2D(2, 2)) - self.cnn.add(gluon.nn.Dense(256, activation='relu')) - self.cnn.add(gluon.nn.Dense(2, activation='softrelu')) + self.cnn = gluon.nn.HybridSequential() + self.cnn.add(gluon.nn.Conv2D(64, 5, activation='relu')) + self.cnn.add(gluon.nn.MaxPool2D(2, 2)) + self.cnn.add(gluon.nn.Conv2D(64, 5, activation='relu')) + self.cnn.add(gluon.nn.MaxPool2D(2, 2)) + self.cnn.add(gluon.nn.Dense(256, activation='relu')) + self.cnn.add(gluon.nn.Dense(2, activation='softrelu')) def hybrid_forward(self, F, input0, input1): out0 = self.cnn(input0) diff --git a/docs/python_docs/python/tutorials/packages/gluon/text/transformer.rst b/docs/python_docs/python/tutorials/packages/gluon/text/transformer.rst index 6fb27653872d..ab1db2f8a134 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/text/transformer.rst +++ b/docs/python_docs/python/tutorials/packages/gluon/text/transformer.rst @@ -530,14 +530,14 @@ robust for the machine translation task. #We use warmup steps as introduced in [1]. warmup_steps = hparams.warmup_steps grad_interval = hparams.num_accumulated - model.collect_params().setattr('grad_req', 'add') + model.setattr('grad_req', 'add') #We use Averaging SGD [2] to update the parameters. 
average_start = (len(train_data_loader) // grad_interval) * \ (hparams.epochs - hparams.average_start) average_param_dict = {k: mx.nd.array([0]) for k, v in model.collect_params().items()} update_average_param_dict = True - model.collect_params().zero_grad() + model.zero_grad() for epoch_id in range(hparams.epochs): utils.train_one_epoch(epoch_id, model, train_data_loader, trainer, label_smoothing, loss_function, grad_interval, diff --git a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md index ad948740a52f..86d0f8fdd2c8 100644 --- a/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md +++ b/docs/python_docs/python/tutorials/packages/gluon/training/learning_rates/learning_rate_schedules.md @@ -156,20 +156,19 @@ train_dataloader = mx.gluon.data.DataLoader(train_dataset, batch_size, shuffle=T # Build a simple convolutional network def build_cnn(): net = nn.HybridSequential() - with net.name_scope(): - # First convolution - net.add(nn.Conv2D(channels=10, kernel_size=5, activation='relu')) - net.add(nn.MaxPool2D(pool_size=2, strides=2)) - # Second convolution - net.add(nn.Conv2D(channels=20, kernel_size=5, activation='relu')) - net.add(nn.MaxPool2D(pool_size=2, strides=2)) - # Flatten the output before the fully connected layers - net.add(nn.Flatten()) - # First fully connected layers with 512 neurons - net.add(nn.Dense(512, activation="relu")) - # Second fully connected layer with as many neurons as the number of classes - net.add(nn.Dense(num_outputs)) - return net + # First convolution + net.add(nn.Conv2D(channels=10, kernel_size=5, activation='relu')) + net.add(nn.MaxPool2D(pool_size=2, strides=2)) + # Second convolution + net.add(nn.Conv2D(channels=20, kernel_size=5, activation='relu')) + net.add(nn.MaxPool2D(pool_size=2, strides=2)) + # Flatten the output before the fully connected layers + net.add(nn.Flatten()) + # First fully connected layers with 512 neurons + net.add(nn.Dense(512, activation="relu")) + # Second fully connected layer with as many neurons as the number of classes + net.add(nn.Dense(num_outputs)) + return net net = build_cnn() ``` @@ -179,7 +178,7 @@ We then initialize our network (technically deferred until we pass the first bat ```python # Initialize the parameters with Xavier initializer -net.collect_params().initialize(mx.init.Xavier(), ctx=ctx) +net.initialize(mx.init.Xavier(), ctx=ctx) # Use cross entropy loss softmax_cross_entropy = mx.gluon.loss.SoftmaxCrossEntropyLoss() ``` @@ -280,7 +279,7 @@ We replicate the example above, but now keep track of the `iteration_idx`, call ```python net = build_cnn() -net.collect_params().initialize(mx.init.Xavier(), ctx=ctx) +net.initialize(mx.init.Xavier(), ctx=ctx) schedule = mx.lr_scheduler.MultiFactorScheduler(step=steps_iterations, factor=0.1) schedule.base_lr = 0.03 diff --git a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train_gluon.md b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train_gluon.md index 9479eb8d6a0c..8b8e551bca22 100644 --- a/docs/python_docs/python/tutorials/packages/ndarray/sparse/train_gluon.md +++ b/docs/python_docs/python/tutorials/packages/ndarray/sparse/train_gluon.md @@ -221,14 +221,13 @@ Gluon's [nn.Dense](/api/python/docs/api/gluon/nn/index.html#mxnet.gluon.nn.Dense class FullyConnected(mx.gluon.HybridBlock): def __init__(self, in_units, units): super(FullyConnected, 
self).__init__() - with self.name_scope(): - self._units = units - self.weight = self.params.get('weight', shape=(units, in_units), - init=None, allow_deferred_init=True, - dtype='float32', stype='default', grad_stype='default') - self.bias = self.params.get('bias', shape=(units), - init='zeros', allow_deferred_init=True, - dtype='float32', stype='default', grad_stype='default') + self._units = units + self.weight = mx.gluon.Parameter('weight', shape=(units, in_units), + init=None, allow_deferred_init=True, + dtype='float32', stype='default', grad_stype='default') + self.bias = mx.gluon.Parameter('bias', shape=(units), + init='zeros', allow_deferred_init=True, + dtype='float32', stype='default', grad_stype='default') def hybrid_forward(self, F, x, weight, bias): return F.FullyConnected(x, weight, bias, num_hidden=self._units) @@ -245,14 +244,13 @@ We could instead have created our parameter with shape `(in_units, units)` and a class FullyConnectedSparse(mx.gluon.HybridBlock): def __init__(self, in_units, units, weight_grad_stype='default'): super(FullyConnectedSparse, self).__init__() - with self.name_scope(): - self._units = units - self.weight = self.params.get('weight', shape=(in_units, units), - init=None, allow_deferred_init=True, - dtype='float32', stype='default', grad_stype=weight_grad_stype) - self.bias = self.params.get('bias', shape=(units), - init='zeros', allow_deferred_init=True, - dtype='float32', stype='default', grad_stype='default') + self._units = units + self.weight = gluon.Parameter('weight', shape=(in_units, units), + init=None, allow_deferred_init=True, + dtype='float32', stype='default', grad_stype=weight_grad_stype) + self.bias = gluon.Parameter('bias', shape=(units), + init='zeros', allow_deferred_init=True, + dtype='float32', stype='default', grad_stype='default') def hybrid_forward(self, F, x, weight, bias): return F.sparse.dot(x, weight) + bias diff --git a/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md b/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md index e1eb3044a9fa..4b90acc8d1bd 100644 --- a/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md +++ b/docs/python_docs/python/tutorials/packages/onnx/fine_tuning_gluon.md @@ -306,9 +306,8 @@ We add the SymbolBlock and the new dense layer to a HybridSequential network ```python net = gluon.nn.HybridSequential() -with net.name_scope(): - net.add(pre_trained) - net.add(dense_layer) +net.add(pre_trained) +net.add(dense_layer) ``` ### Loss diff --git a/docs/python_docs/python/tutorials/performance/backend/amp.md b/docs/python_docs/python/tutorials/performance/backend/amp.md index 289e783a0fb8..1a28f9179703 100644 --- a/docs/python_docs/python/tutorials/performance/backend/amp.md +++ b/docs/python_docs/python/tutorials/performance/backend/amp.md @@ -98,7 +98,7 @@ def get_network(): warnings.simplefilter("ignore") net = get_model(net_name, pretrained_base=True, norm_layer=gluon.nn.BatchNorm) net.initialize() - net.collect_params().reset_ctx(ctx) + net.reset_ctx(ctx) return net ``` @@ -272,7 +272,7 @@ with mx.Context(mx.gpu(0)): with warnings.catch_warnings(record=True) as w: warnings.simplefilter("ignore") model = get_model("resnet50_v1") - model.collect_params().initialize(ctx=mx.current_context()) + model.initialize(ctx=mx.current_context()) model.hybridize() model(mx.nd.zeros((1, 3, 224, 224))) converted_model = amp.convert_hybrid_block(model) diff --git a/docs/python_docs/python/tutorials/performance/backend/profiler.md 
b/docs/python_docs/python/tutorials/performance/backend/profiler.md index b9798a6c6594..1c63cc96d6d9 100644 --- a/docs/python_docs/python/tutorials/performance/backend/profiler.md +++ b/docs/python_docs/python/tutorials/performance/backend/profiler.md @@ -78,14 +78,13 @@ Let's build a small convolutional neural network that we can use to demonstrate from mxnet import gluon net = gluon.nn.HybridSequential() -with net.name_scope(): - net.add(gluon.nn.Conv2D(channels=20, kernel_size=5, activation='relu')) - net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2)) - net.add(gluon.nn.Conv2D(channels=50, kernel_size=5, activation='relu')) - net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2)) - net.add(gluon.nn.Flatten()) - net.add(gluon.nn.Dense(512, activation="relu")) - net.add(gluon.nn.Dense(10)) +net.add(gluon.nn.Conv2D(channels=20, kernel_size=5, activation='relu')) +net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2)) +net.add(gluon.nn.Conv2D(channels=50, kernel_size=5, activation='relu')) +net.add(gluon.nn.MaxPool2D(pool_size=2, strides=2)) +net.add(gluon.nn.Flatten()) +net.add(gluon.nn.Dense(512, activation="relu")) +net.add(gluon.nn.Dense(10)) ``` We need data that we can run through the network for profiling. We'll use the MNIST dataset. @@ -108,7 +107,7 @@ else: ctx=mx.cpu() # Initialize the parameters with random weights -net.collect_params().initialize(mx.init.Xavier(), ctx=ctx) +net.initialize(mx.init.Xavier(), ctx=ctx) # Use SGD optimizer trainer = gluon.Trainer(net.collect_params(), 'sgd', {'learning_rate': 0.1})
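The same migration pattern recurs throughout the hunks above: child blocks and parameters become plain attributes, `self.params.get(...)` becomes `gluon.Parameter(...)`, and `initialize`, `reset_ctx` and `zero_grad` are called on the block itself rather than on `collect_params()`. Below is a minimal sketch of a custom block written in this updated style, assuming MXNet 2.x Gluon; the `ScaledDense` class and its shapes are illustrative and not part of the patch.

```python
import mxnet as mx
from mxnet import gluon

class ScaledDense(gluon.HybridBlock):
    """Custom block in the updated style: the child block and the parameter
    are plain attributes; no name_scope() and no self.params.get()."""
    def __init__(self, units, in_units):
        super(ScaledDense, self).__init__()
        self.dense = gluon.nn.Dense(units, in_units=in_units)   # child block
        self.scale = gluon.Parameter('scale', shape=(1, 1),     # kept 2-D so it
                                     init=mx.init.Constant(2.0),# broadcasts against
                                     differentiable=False)      # the (batch, units) output

    def hybrid_forward(self, F, x, scale):
        # Parameters registered on this block are passed in as extra arguments,
        # matching the pattern used in the custom-layer tutorials above.
        return F.broadcast_mul(self.dense(x), scale)

net = ScaledDense(units=4, in_units=8)
net.initialize()     # no collect_params() needed
net.hybridize()
out = net(mx.nd.ones((2, 8)))
```

As in the tutorials above, parameters declared this way are saved and loaded by block structure via `save_parameters`/`load_parameters`, or by `param.name` when going through `export`/`SymbolBlock.imports`.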