diff --git a/CMakeLists.txt b/CMakeLists.txt
index a0f153a01c6c..dbf9ecd85da9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -299,6 +299,16 @@ if(USE_MKLDNN)
   endif()
 
   function(load_mkldnn)
+    if (MKLDNN_USE_ACL)
+      # C++ 14 is required to build with ACL
+      # ACL_ROOT_DIR also needs to be set
+      set(CMAKE_CXX_STANDARD 14)
+      set(DNNL_AARCH64_USE_ACL ON CACHE INTERNAL "" FORCE)
+    endif()
+    if (MKLDNN_USE_APL)
+      # APL needs to be added to LD_LIBRARY_PATH
+      set(DNNL_BLAS_VENDOR "ARMPL" CACHE INTERNAL "" FORCE)
+    endif()
     set(MKLDNN_BUILD_TESTS OFF CACHE INTERNAL "" FORCE)
     set(MKLDNN_BUILD_EXAMPLES OFF CACHE INTERNAL "" FORCE)
     set(MKLDNN_ARCH_OPT_FLAGS "" CACHE INTERNAL "" FORCE)
diff --git a/config/distribution/linux_aarch64_cpu.cmake b/config/distribution/linux_aarch64_cpu.cmake
index 98da9c2a60a8..391d34164161 100644
--- a/config/distribution/linux_aarch64_cpu.cmake
+++ b/config/distribution/linux_aarch64_cpu.cmake
@@ -30,17 +30,13 @@ set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support")
 set(USE_DIST_KVSTORE OFF CACHE BOOL "Build with DIST_KVSTORE support")
 set(USE_MKLDNN ON CACHE BOOL "Build with MKL-DNN support")
 
-# Uncomment the following line to build MKLDNN with APL support
-# APL can be downloaded from https://developer.arm.com/tools-and-software/server-and-hpc/downloads/arm-performance-libraries
-# APL needs to be added to LD_LIBRARY_PATH
-## set(DNNL_BLAS_VENDOR “ARMPL” CACHE STRING “Build MKLDNN with Arm Performance Libraries as the BLAS library”)
-# Uncomment the following lines to build MKLDNN with ACL support
-# C++ 14 is requried to build with ACL and MKLDNN recommends building ACL from source rather than
-# using the pre-built binaries from https://github.com/ARM-software/ComputeLibrary/releases
-# If pre-built binaries are used anyways, make sure to copy and rename the appropriate binaries
-# folder from /lib/ to /build
+# Pre-built ACL binaries are available from https://github.com/ARM-software/ComputeLibrary/releases
+# Make sure to copy and rename the appropriate binaries folder
+# from /lib/ to /build
 # The resulting acl root folder should look something like:
 # LICENSE README.md arm_compute build examples include lib scripts support utils
-## set(CMAKE_CXX_STANDARD 14)
-## set(ENV{ACL_ROOT_DIR} ~/arm_compute-v21.02-bin-linux)
-## set(DNNL_AARCH64_USE_ACL ON CACHE BOOL “Build MKLDNN with Arm Compute Library integration”)
+set(ENV{ACL_ROOT_DIR} "")
+set(MKLDNN_USE_ACL OFF CACHE BOOL "Integrate MKLDNN with Arm Compute Library")
+# APL can be downloaded from https://developer.arm.com/tools-and-software/server-and-hpc/downloads/arm-performance-libraries
+# Note that APL needs to be added to LD_LIBRARY_PATH
+set(MKLDNN_USE_APL OFF CACHE BOOL "Integrate MKLDNN with Arm Performance Libraries")
diff --git a/config/linux_arm.cmake b/config/linux_arm.cmake
new file mode 100644
index 000000000000..6e6c01d877c4
--- /dev/null
+++ b/config/linux_arm.cmake
@@ -0,0 +1,149 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+#-------------------------------------------------------------------------------
+# Template configuration for compiling MXNet
+#
+# If you want to change the configuration, please use the following steps.
+# Assume you are in the root directory of mxnet. First copy this file so that
+# any local changes will be ignored by git
+#
+#   $ cp config/linux_arm.cmake config.cmake
+#
+# Next modify the relevant entries, and then compile by
+#
+#   $ mkdir build; cd build
+#   $ cmake ..
+#   $ cmake --build .
+#
+# Specify `cmake --build . --parallel N` to set the number of parallel compilation jobs.
+# Default is derived from CPUs available.
+#
+#-------------------------------------------------------------------------------
+
+#---------------------------------------------
+# Arm flags
+#---------------------------------------------
+# Set the correct C and CXX flags according to your Arm processor's architecture
+# e.g. "armv8-a"
+set(CFLAGS "-march=armv8-a" CACHE STRING "CFLAGS")
+set(CXXFLAGS "-march=armv8-a" CACHE STRING "CXXFLAGS")
+
+#---------------------------------------------
+# GPU support
+#---------------------------------------------
+set(USE_CUDA OFF CACHE BOOL "Build with CUDA support")
+set(USE_CUDNN OFF CACHE BOOL "Build with cuDNN support, if found")
+
+# Target NVIDIA GPU architecture.
+# Valid options are "Auto" for autodetection, "All" for all available
+# architectures or a list of architectures by compute capability number, such as
+# "7.0" or "7.0;7.5" as well as name, such as "Volta" or "Volta;Turing".
+# The value specified here is passed to cmake's CUDA_SELECT_NVCC_ARCH_FLAGS to
+# obtain the compilation flags for nvcc.
+#
+# When compiling on a machine without GPU, autodetection will fail and you
+# should instead specify the target architecture manually to avoid excessive
+# compilation times.
+set(MXNET_CUDA_ARCH "Auto" CACHE STRING "Target NVIDIA GPU architecture")
+
+#---------------------------------------------
+# Common libraries
+#---------------------------------------------
+set(USE_BLAS "open" CACHE STRING "BLAS Vendor")
+
+set(USE_OPENCV ON CACHE BOOL "Build with OpenCV support")
+set(OPENCV_ROOT "" CACHE BOOL "OpenCV install path. Supports autodetection.")
+
+set(USE_OPENMP ON CACHE BOOL "Build with OpenMP support")
+
+set(USE_MKL_IF_AVAILABLE ON CACHE BOOL "Use Intel MKL if found")
+set(USE_MKLDNN ON CACHE BOOL "Build with MKL-DNN support")
+
+set(USE_LAPACK ON CACHE BOOL "Build with LAPACK support")
+
+set(USE_TVM_OP OFF CACHE BOOL "Enable use of TVM operator build system.")
+
+# Integrate MKLDNN with Arm Performance Libraries
+# Note that APL needs to be added to LD_LIBRARY_PATH
+set(MKLDNN_USE_APL OFF CACHE BOOL "Integrate MKLDNN with Arm Performance Libraries")
+
+# Integrate MKLDNN with Arm Compute Library
+set(ENV{ACL_ROOT_DIR} "")
+set(MKLDNN_USE_ACL OFF CACHE BOOL "Integrate MKLDNN with Arm Compute Library")
+
+#---------------------
+# Compilers
+#--------------------
+set(CMAKE_GENERATOR "Ninja" CACHE STRING "Build Tool Generator used by CMake")
+
+# Compilers are usually autodetected. Uncomment and modify the next 3 lines to
+# choose manually:
+
+# set(CMAKE_C_COMPILER "" CACHE BOOL "C compiler")
+# set(CMAKE_CXX_COMPILER "" CACHE BOOL "C++ compiler")
+# set(CMAKE_CUDA_COMPILER "" CACHE BOOL "CUDA compiler (nvcc)")
+
+# Uncomment the following line to compile with debug information
+# set(CMAKE_BUILD_TYPE Debug CACHE STRING "CMake build type")
+
+#---------------------------------------------
+# CPU instruction sets: The support is autodetected if turned ON
+#---------------------------------------------
+set(USE_SSE OFF CACHE BOOL "Build with x86 SSE instruction support")
+set(USE_F16C OFF CACHE BOOL "Build with x86 F16C instruction support")
+
+
+#----------------------------
+# distributed computing
+#----------------------------
+set(USE_DIST_KVSTORE OFF CACHE BOOL "Build with DIST_KVSTORE support")
+
+
+#----------------------------
+# performance settings
+#----------------------------
+set(USE_OPERATOR_TUNING ON CACHE BOOL "Enable auto-tuning of operators")
+set(USE_GPERFTOOLS OFF CACHE BOOL "Build with GPerfTools support")
+set(USE_JEMALLOC OFF CACHE BOOL "Build with Jemalloc support")
+
+
+#----------------------------
+# additional operators
+#----------------------------
+# path to folders containing project-specific operators that you don't want to
+# put in src/operators
+SET(EXTRA_OPERATORS "" CACHE PATH "EXTRA OPERATORS PATH")
+
+
+#----------------------------
+# other features
+#----------------------------
+# Create C++ interface package
+set(USE_CPP_PACKAGE OFF CACHE BOOL "Build C++ Package")
+
+# Use int64_t type to represent the total number of elements in a tensor
+# Enabling this will cause the performance degradation reported in issue #14496
+# Set to ON to support tensors with more than INT32_MAX (2147483647) elements
+# Note: the size of each dimension is still bounded by INT32_MAX
+set(USE_INT64_TENSOR_SIZE OFF CACHE BOOL "Use int64_t to represent the total number of elements in a tensor")
+
+# Other GPU features
+set(USE_NCCL OFF CACHE BOOL "Use NVIDIA NCCL with CUDA")
+set(NCCL_ROOT "" CACHE BOOL "NCCL install path. Supports autodetection.")
+set(ENABLE_CUDA_RTC OFF CACHE BOOL "Build with CUDA runtime compilation support")
+set(USE_NVTX OFF CACHE BOOL "Build with NVTX support")
diff --git a/docs/python_docs/environment.yml b/docs/python_docs/environment.yml
index 7856a076889b..6c4a5beebe66 100644
--- a/docs/python_docs/environment.yml
+++ b/docs/python_docs/environment.yml
@@ -26,6 +26,7 @@ dependencies:
 - sphinx==2.4.0
 - matplotlib
 - notebook
+- Jinja2==2.11.3
 - pip:
   - nbconvert==5.6.1
   - nbsphinx==0.4.3
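Usage note (not part of the patch): with these changes applied, enabling the ACL integration might look roughly like the sketch below. The ~/arm_compute-v21.02-bin-linux directory reuses the illustrative path from the comment being removed above, the MXNet checkout location is a placeholder, and the exact lib/ subfolder to copy depends on your target.

  $ cd ~/arm_compute-v21.02-bin-linux
  $ cp -r lib/<target-specific-folder> build        # rename the matching binaries folder from lib/ to build/
  $ cd <mxnet-root>
  $ cp config/linux_arm.cmake config.cmake
  # then edit config.cmake:
  #   set(ENV{ACL_ROOT_DIR} ~/arm_compute-v21.02-bin-linux)
  #   set(MKLDNN_USE_ACL ON CACHE BOOL "Integrate MKLDNN with Arm Compute Library")
  $ mkdir build; cd build
  $ cmake ..
  $ cmake --build .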
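Similarly, a minimal sketch for the APL path; the Arm Performance Libraries install location shown is an assumption, not something set by this patch.

  $ cd <mxnet-root>
  $ cp config/linux_arm.cmake config.cmake
  # then edit config.cmake:
  #   set(MKLDNN_USE_APL ON CACHE BOOL "Integrate MKLDNN with Arm Performance Libraries")
  $ export LD_LIBRARY_PATH=<apl-install-dir>/lib:$LD_LIBRARY_PATH   # APL must be on LD_LIBRARY_PATH
  $ mkdir build; cd build
  $ cmake ..
  $ cmake --build .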