Commit
Merge branch 'main' into upstream_irmodule_parser_from_unity_pr_14487
Lunderberg committed Apr 5, 2023
2 parents 08a9187 + e51ba29 commit a229184
Showing 64 changed files with 1,994 additions and 185 deletions.
3 changes: 3 additions & 0 deletions docker/Dockerfile.ci_cpu
@@ -47,6 +47,9 @@ RUN bash /install/ubuntu_install_python_package.sh
COPY install/ubuntu1804_install_llvm.sh /install/ubuntu1804_install_llvm.sh
RUN bash /install/ubuntu1804_install_llvm.sh

COPY install/ubuntu_install_llvm_from_source.sh /install/ubuntu_install_llvm_from_source.sh
RUN bash /install/ubuntu_install_llvm_from_source.sh 15.0.7 8b5fcb24b4128cf04df1b0b9410ce8b1a729cb3c544e6da885d234280dedeac6

COPY install/ubuntu_install_dnnl.sh /install/ubuntu_install_dnnl.sh
RUN bash /install/ubuntu_install_dnnl.sh

1 change: 1 addition & 0 deletions docker/install/ubuntu_install_llvm_from_source.sh
@@ -60,6 +60,7 @@ cmake \
-DLLVM_ENABLE_ASSERTIONS=ON \
-DLLVM_ENABLE_RTTI=ON \
-DLLVM_ENABLE_OCAMLDOC=OFF \
-DLLVM_ENABLE_PROJECTS=mlir \
-DLLVM_USE_INTEL_JITEVENTS=ON \
-DLLVM_TEMPORARILY_ALLOW_OLD_TOOLCHAIN=ON \
-DPYTHON_EXECUTABLE="$(cpython_path 3.7)/bin/python" \
6 changes: 3 additions & 3 deletions include/tvm/relay/attrs/nn.h
@@ -671,10 +671,10 @@ struct Conv1DTransposeAttrs : public tvm::AttrsNode<Conv1DTransposeAttrs> {
"dimensions respectively. Convolution is applied on the"
"'W' dimension.");
TVM_ATTR_FIELD(kernel_layout)
.set_default("OIW")
.set_default("IOW")
.describe(
"Dimension ordering of data and weight. Can be 'OIW', 'OIW16o16i', etc."
"'O', 'I', 'W' stands for num_filter, input_channel, and width"
"Dimension ordering of data and weight. Can be 'IOW', 'IOW16o16i', etc."
"'I', 'O', 'W' stands for input_channel, num_filter and width"
"dimensions respectively.");
TVM_ATTR_FIELD(out_layout)
.set_default("")
16 changes: 16 additions & 0 deletions include/tvm/tir/schedule/schedule.h
@@ -292,6 +292,16 @@ class ScheduleNode : public runtime::Object {
*/
virtual Array<BlockRV> GetConsumers(const BlockRV& block_rv) = 0;
/******** Schedule: Transform loops ********/
/*!
* \brief Merge a list of loops into one. The loops under their LCA must satisfy:
* 1) They are under the same scope.
* 2) They can't have annotations or thread bindings.
* 3) They start at 0 and have the same extent and the same nesting depth.
* 4) From each target loop to their LCA, the inner loop must be the only child of the outer loop.
* \param loop_rvs The loops to be merged
* \return The new loop after merge
*/
virtual LoopRV Merge(const Array<LoopRV>& loop_rvs) = 0;
/*!
* \brief Fuse a list of consecutive loops into one. It requires:
* 1) The loops can't have annotations or thread bindings.
@@ -328,6 +338,12 @@ class ScheduleNode : public runtime::Object {
* \param ordered_loop_rvs The loops in the new order
*/
virtual void Reorder(const Array<LoopRV>& ordered_loop_rvs) = 0;
/*!
* \brief Reorder the itervars inside a block.
* \param block_rv The block to be transformed.
* \param new_order The new itervar order.
*/
virtual void ReorderBlockIterVar(const BlockRV& block_rv, const Array<Integer> new_order) = 0;
/*!
* \brief Create a new unit loop on top of the specific block.
* \param block_rv The block above which the new loop is created
12 changes: 8 additions & 4 deletions python/tvm/relay/frontend/keras.py
@@ -282,6 +282,8 @@ def _convert_dense(


def _convert_convolution1d(inexpr, keras_layer, etab, data_layout, input_shape=None):
is_deconv = type(keras_layer).__name__ == "Conv1DTranspose"

if input_shape is None:
input_shape = keras_layer.input_shape
_check_data_format(keras_layer)
@@ -290,19 +292,21 @@ def _convert_convolution1d(inexpr, keras_layer, etab, data_layout, input_shape=None):

if data_layout == "NWC":
kernel_layout = "WIO"
if is_deconv:
kernel_layout = "WOI"
else:
kernel_layout = "OIW"
if is_deconv:
kernel_layout = "IOW"
msg = (
"Kernel layout with {} is not supported for operator Convolution1D "
"in frontend Keras."
)
raise tvm.error.OpAttributeUnImplemented(msg.format(data_layout))

is_deconv = type(keras_layer).__name__ == "Conv1DTranspose"

if is_deconv:
if kernel_layout == "OIW":
weight = weight.transpose([2, 0, 1])
if kernel_layout == "IOW":
weight = weight.transpose([2, 1, 0])
kernel_w, n_filters, _ = weight.shape
else:
kernel_w, _, n_filters = weight.shape
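For the transposed case, the frontend permutes the kernel into the new IOW layout with transpose([2, 1, 0]). A minimal NumPy sketch of that axis permutation, assuming the Keras Conv1DTranspose kernel arrives in (width, out_channels, in_channels) order; the shapes are illustrative, not taken from this commit:

import numpy as np

# Hypothetical Conv1DTranspose kernel in Keras' (W, O, I) order:
# kernel_w = 5, n_filters = 8, in_channels = 3.
keras_weight = np.zeros((5, 8, 3), dtype="float32")

# transpose([2, 1, 0]) reverses the axis order, giving (I, O, W),
# which matches the frontend's new "IOW" kernel layout.
iow_weight = keras_weight.transpose([2, 1, 0])
assert iow_weight.shape == (3, 8, 5)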
2 changes: 1 addition & 1 deletion python/tvm/relay/frontend/mxnet.py
@@ -304,7 +304,7 @@ def _mx_conv1d_transpose(inputs, attrs):
if data_layout != "NCW":
raise tvm.error.OpAttributeInvalid('Only "NCW" data layout is supported for 1D Convolution')
channel_axis = 1
kernel_layout = "OIW"
kernel_layout = "IOW"
new_attrs = {}
new_attrs["channels"] = attrs.get_int("num_filter")
new_attrs["kernel_size"] = attrs.get_int_tuple("kernel")
4 changes: 2 additions & 2 deletions python/tvm/relay/frontend/oneflow.py
@@ -84,8 +84,8 @@ def get_node_info(node):
shape = tuple(node.input_conf.blob_conf.shape.dim)
# get data type
dtype = node.input_conf.blob_conf.data_type
if dtype in list(FLOW_2_NP_DTYPE.keys()):
data_type = FLOW_2_NP_DTYPE[dtype]
if dtype in list(FLOW_2_STR_DTYPE.keys()):
data_type = FLOW_2_STR_DTYPE[dtype]
else:
raise IndexError("Please check the data type of your node: %s" % node.name)

3 changes: 3 additions & 0 deletions python/tvm/relay/frontend/pytorch.py
@@ -1263,6 +1263,9 @@ def convolution(self, inputs, input_types):
else:
data_layout = "NCW"
kernel_layout = "OIW"
if use_transpose:
# Transposed convolutions have IOW layout.
kernel_layout = "IOW"

# Conv1d does not currently support grouped convolution so we convert it to conv2d
is_grouped_conv1d = False
24 changes: 23 additions & 1 deletion python/tvm/relay/op/contrib/arm_compute_lib.py
@@ -359,6 +359,10 @@ def qnn_conv2d(expr):
kernel_typ = args[1].checked_type
if len(kernel_typ.shape) != 4 or kernel_typ.dtype not in qnn_dtypes:
return False
if is_per_channel_quantization(
zero_point=args[2], scale=args[4]
) or is_per_channel_quantization(zero_point=args[3], scale=args[5]):
return False
is_depthwise = is_depthwise_conv2d(
data_typ.shape,
attrs["data_layout"],
@@ -422,6 +426,10 @@ def qnn_dense(expr):
return False
if attrs.out_dtype != "int32":
return False
if is_per_channel_quantization(
zero_point=args[2], scale=args[4]
) or is_per_channel_quantization(zero_point=args[3], scale=args[5]):
return False
return True


@@ -514,10 +522,24 @@ def qnn_add(expr):
for typ in [args[0].checked_type, args[1].checked_type]:
if typ.dtype not in ["int8", "uint8"]:
return False

if (
is_per_channel_quantization(zero_point=args[3], scale=args[2])
or is_per_channel_quantization(zero_point=args[5], scale=args[4])
or is_per_channel_quantization(zero_point=args[7], scale=args[6])
):
return False
return True


def is_per_channel_quantization(zero_point, scale):
"""Check if the quantization is per-channel"""
for value in [zero_point, scale]:
shape = value.checked_type.shape
if len(shape) != 0 and shape[0] != 1:
return True
return False


class OpAttrContext(object):
"""Temporarily changes the attr of an op."""

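The new is_per_channel_quantization helper treats any scale or zero point whose type has a non-empty shape longer than one element as per-channel, and the qnn_conv2d, qnn_dense, and qnn_add patterns above now bail out in that case, presumably because these offloaded patterns only handle per-tensor parameters. A standalone sketch of the shape rule, mirroring the helper without calling TVM:

def looks_per_channel(shape):
    # A scalar () or an explicit length-1 vector (1,) is per-tensor;
    # anything longer carries one value per output channel.
    return len(shape) != 0 and shape[0] != 1

assert looks_per_channel(()) is False       # per-tensor scalar
assert looks_per_channel((1,)) is False     # per-tensor, explicit length 1
assert looks_per_channel((8,)) is True      # per-channel: one value per channel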
15 changes: 12 additions & 3 deletions python/tvm/relay/op/contrib/ethosu.py
@@ -1671,7 +1671,18 @@ def check_compatible_size(mode, method, upscale_size, ifm_size):
return False
if self.method not in ("nearest_neighbor", "linear"):
return False
if self.coordinate_transformation_mode not in ("asymmetric", "align_corners"):
if self.coordinate_transformation_mode not in (
"asymmetric",
"align_corners",
"half_pixel",
):
return False
if (
self.coordinate_transformation_mode == "half_pixel"
and self.rounding_method != "round_prefer_ceil"
or self.coordinate_transformation_mode != "half_pixel"
and self.rounding_method != ""
):
return False
if not check_compatible_size(
self.coordinate_transformation_mode,
Expand All @@ -1680,8 +1691,6 @@ def check_compatible_size(mode, method, upscale_size, ifm_size):
self.ifm.shape[1:3],
):
return False
if self.rounding_method != "":
return False
if self.out_dtype and self.out_dtype != "int8":
return False
return True
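The resize check now accepts half_pixel as a coordinate transformation mode, but couples it to a specific rounding method: half_pixel requires round_prefer_ceil, while asymmetric and align_corners still require an empty rounding method. A standalone sketch of the accepted pairings; it mirrors the condition above without calling TVM:

def resize_params_supported(coord_mode, rounding):
    # Mirrors the coordinate-transformation / rounding-method pairing above.
    if coord_mode not in ("asymmetric", "align_corners", "half_pixel"):
        return False
    if coord_mode == "half_pixel" and rounding != "round_prefer_ceil":
        return False
    if coord_mode != "half_pixel" and rounding != "":
        return False
    return True

assert resize_params_supported("half_pixel", "round_prefer_ceil")
assert resize_params_supported("asymmetric", "")
assert not resize_params_supported("half_pixel", "")
assert not resize_params_supported("asymmetric", "round_prefer_ceil")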
2 changes: 1 addition & 1 deletion python/tvm/relay/op/nn/nn.py
@@ -604,7 +604,7 @@ def conv1d_transpose(
channels=None,
kernel_size=None,
data_layout="NCW",
kernel_layout="OIW",
kernel_layout="IOW",
out_layout="",
output_padding=(0,),
out_dtype="",
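With the default flipped to IOW, a transposed-convolution weight is expected as (in_channels, out_channels, kernel_width). A minimal Relay sketch of the new convention; the shapes and variable names are illustrative, not taken from this commit:

import tvm
from tvm import relay

data = relay.var("data", shape=(1, 3, 32), dtype="float32")     # NCW: batch, in_channels, width
weight = relay.var("weight", shape=(3, 8, 5), dtype="float32")  # IOW: in_channels, out_channels, kernel_w
out = relay.nn.conv1d_transpose(
    data,
    weight,
    channels=8,
    kernel_size=(5,),
    data_layout="NCW",
    kernel_layout="IOW",  # now the default
)
mod = tvm.IRModule.from_expr(relay.Function([data, weight], out))
print(mod)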
9 changes: 5 additions & 4 deletions python/tvm/script/parser/core/parser.py
@@ -190,10 +190,11 @@ def exist(self, value: Any) -> bool:
res : bool
The existence of the value.
"""
for v in self.name2value.values():
if v is value:
return True
return False
return any(
value.same_as(known_value)
for known_value_stack in self.name2value.values()
for known_value in known_value_stack
)


def _dispatch_wrapper(func: dispatch.ParseMethod) -> dispatch.ParseMethod:
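The rewritten exist walks every value stack in name2value (each name maps to a stack of possibly shadowed values) and compares with same_as rather than Python identity, since two Python handles can wrap the same underlying TVM object. A small sketch of that distinction; the expression used is illustrative:

import tvm

x = tvm.tir.Var("x", "int32")
expr = x + 1

a = expr.a  # one Python handle to the Var stored in the expression
b = expr.a  # a second handle; generally a distinct Python wrapper object

print(a is b)        # often False: different wrapper objects
print(a.same_as(b))  # True: both refer to the same underlying node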
2 changes: 1 addition & 1 deletion python/tvm/tir/op.py
@@ -527,7 +527,7 @@ def tvm_struct_set(arr, index, field, value):
call : PrimExpr
The call expression.
"""
return call_intrin("handle", "tir.tvm_struct_set", arr, index, field, value)
return call_intrin("int32", "tir.tvm_struct_set", arr, index, field, value)


def address_of(buffer_load, span=None):
91 changes: 91 additions & 0 deletions python/tvm/tir/schedule/schedule.py
@@ -541,6 +541,84 @@ def get_consumers(self, block: Union[BlockRV, str]) -> List[BlockRV]:
return list(_ffi_api.ScheduleGetConsumers(self, block)) # type: ignore # pylint: disable=no-member

########## Schedule: Transform loops ##########
@type_checked
def merge(
self,
*loops: List[LoopRV],
) -> LoopRV:
"""Merge a list of loops into one. The loops under their LCA requires:
1) Under the same scope.
2) Can't have annotations or thread bindings.
3) Start with 0 and have same extent and same nesting depth.
4) From target loop to their LCA, The inner loop must be the only child of the outer loop.
Parameters
----------
*loops : List[LoopRV]
The loops to be merged
Returns
-------
fused_loop : LoopRV
The new loop after merge
Examples
--------
Before applying merge, in TensorIR, the IR is:
.. code-block:: python
@T.prim_func
def before_merge(a: T.handle, b: T.handle, c: T.handle) -> None:
A = T.match_buffer(a, (128, 128))
B = T.match_buffer(b, (128, 128))
C = T.match_buffer(c, (128, 128))
for i, j in T.grid(128, 128):
with T.block("B"):
vi, vj = T.axis.remap("SS", [i, j])
B[vi, vj] = A[vi, vj] * 2.0
for i, j in T.grid(128, 128):
with T.block("C"):
vi, vj = T.axis.remap("SS", [i, j])
C[vi, vj] = A[vi, vj] * 2.0
Create the schedule and do merge:
.. code-block:: python
sch = tir.Schedule(before_merge)
i1, _ = sch.get_loops(sch.get_block("B"))
i2, _ = sch.get_loops(sch.get_block("C"))
sch.merge(i1, i2)
print(sch.mod["main"].script())
After applying merge, the IR becomes:
.. code-block:: python
@T.prim_func
def after_merge(a: T.handle, b: T.handle, c: T.handle) -> None:
A = T.match_buffer(a, (128, 128))
B = T.match_buffer(b, (128, 128))
C = T.match_buffer(c, (128, 128))
# the 2 loops are merged into 1
for i_m in range(128):
for j in range(128):
with T.block("B"):
vi, vj = T.axis.remap("SS", [i_m, j])
T.reads(A[vi, vj])
T.writes(B[vi, vj])
B[vi, vj] = A[vi, vj] * T.float32(2)
for j in range(128):
with T.block("C"):
vi, vj = T.axis.remap("SS", [i_m, j])
T.reads(A[vi, vj])
T.writes(C[vi, vj])
C[vi, vj] = A[vi, vj] * T.float32(2)
"""
return _ffi_api.ScheduleMerge(self, loops) # type: ignore # pylint: disable=no-member

@type_checked
def fuse(
self,
@@ -752,6 +830,19 @@ def after_reorder(a: T.handle, b: T.handle) -> None:
"""
_ffi_api.ScheduleReorder(self, ordered_loops) # type: ignore # pylint: disable=no-member

@type_checked
def reorder_block_iter_var(self, block: BlockRV, new_order: List[int]) -> None:
"""Reorder the itervars inside a given block.
Parameters
----------
block : BlockRV
The block to be transformed.
new_order : List[int]
The new block itervar order.
"""
_ffi_api.ScheduleReorderBlockIterVar(self, block, new_order) # type: ignore # pylint: disable=no-member

@type_checked
def add_unit_loop(self, block_or_loop: Union[LoopRV, BlockRV]) -> LoopRV:
"""Create a new unit loop on top of the specific block or loop.
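reorder_block_iter_var takes a permutation of itervar indices rather than loop variables. A hedged usage sketch under an assumed workload (whether a given permutation is legal depends on the block; the prim_func below is illustrative):

import tvm
from tvm.script import tir as T


@T.prim_func
def before(a: T.handle, b: T.handle) -> None:
    A = T.match_buffer(a, (128, 128))
    B = T.match_buffer(b, (128, 128))
    for i, j in T.grid(128, 128):
        with T.block("B"):
            vi, vj = T.axis.remap("SS", [i, j])
            B[vi, vj] = A[vi, vj] * 2.0


sch = tvm.tir.Schedule(before)
block = sch.get_block("B")
sch.reorder_block_iter_var(block, [1, 0])  # swap the two block itervars
print(sch.mod["main"].script())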
4 changes: 2 additions & 2 deletions python/tvm/tir/tensor_intrin/cuda.py
@@ -245,7 +245,7 @@ def mma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
for i, j, k in T.grid(M_DIM, N_DIM, k_dim):
with T.block("C"):
i, j, k = T.axis.remap("SSR", [i, j, k])
b_row_ind, b_col_ind = maybe_swap(k, j)
b_row_ind, b_col_ind = T.meta_var(maybe_swap(k, j))

thread_id_C, local_id_C = T.meta_var(index_map_C(i, j))
thread_id_A, local_id_A = T.meta_var(index_map_A(i, k))
@@ -719,7 +719,7 @@ def wmma_sync_desc(a: T.handle, b: T.handle, c: T.handle) -> None:
for i, j, k in T.grid(m_dim, n_dim, k_dim):
with T.block(""):
vii, vjj, vkk = T.axis.remap("SSR", [i, j, k])
B_index_0, B_index_1 = maybe_swap(vkk, vjj)
B_index_0, B_index_1 = T.meta_var(maybe_swap(vkk, vjj))
C[vii, vjj] = C[vii, vjj] + maybe_cast(A[vii, vkk]) * maybe_cast(
B[B_index_0, B_index_1]
)