BYO-FT support, with some LoRA support #224

Status: Closed. Wants to merge 33 commits.

Commits (33), showing changes from all commits:
b8b9280  [PR-1685][ParamManager] Preserve variable names in transform_dequantize (Lunderberg, Jan 11, 2024)
29c49fc  [PR-1686][ParamManager] Simplify get_param_loading_functions signature (Lunderberg, Jan 11, 2024)
1ded3c2  [PR-1687][ParamManager] Separate get_param_loading_functions for get/set (Lunderberg, Jan 11, 2024)
8573cc4  [PR-1756][Transform] Add check for function.attrs (Lunderberg, Feb 14, 2024)
18a6d1c  [PR-1757][Transform] Handle non-schedulable func in LiftTIRGlobalBuff… (Lunderberg, Feb 14, 2024)
76cb787  [PR-1758][Models] Define sharding strategy when combine_matmul=False (Lunderberg, Feb 2, 2024)
de2b289  [PR-1760][Bugfix] Remove mutation of IRModule in ReorderTransformFunc (Lunderberg, Feb 8, 2024)
aadff29  [PR-1851][Bugfix] Handle model names with multiple path components (Lunderberg, Feb 27, 2024)
f1fde2a  [PR-1852][Build] Replace mod_transform_before_build with IRModule pass (Lunderberg, Jan 11, 2024)
1db8a2b  [PR-1855][Utils][Transform] Added SetEntryFuncs transform (Lunderberg, Feb 8, 2024)
c136c11  [PR-1856][Build] Update transform_params_for_each_rank to IRModule pass (Lunderberg, Feb 1, 2024)
ec531cf  [PR-1857][Utils] Allow ReorderTransformFunc to be used without param … (Lunderberg, Feb 15, 2024)
f5330d4  [Utils][Bugfix] Provide full path to shutil.copy in copy_tokenizer (Lunderberg, Jan 11, 2024)
035c915  [Model] Update Mixtral to have well-formed TIR (Lunderberg, Feb 29, 2024)
1069f69  Apply black auto-format (Lunderberg, Feb 27, 2024)
7e826c1  [BYO-FT] Generate a `transform_params` function in compiled module (Lunderberg, Jan 30, 2024)
ec048b0  [BYO-FT] Set combine_matmul=False for llama.py, VLLM-llama, mistral (Lunderberg, Feb 6, 2024)
de36ed9  [BYO-FT] Support execution of transform_params during initialization (Lunderberg, Feb 1, 2024)
ad51336  [Llama] Produce well-formed TIR for PagedAttention (Lunderberg, Jan 24, 2024)
03082e8  [Debug] Output `original_params` directory (Lunderberg, Feb 6, 2024)
a4376c3  [Debug] Implement validate_transform_params (Lunderberg, Feb 8, 2024)
234e777  [Debug] Add verify_well_formed calls (Lunderberg, Feb 6, 2024)
6f4aad1  [Debug] Print optimized model (Lunderberg, Feb 8, 2024)
71e5e93  [Debug] Added assert in ParamManager indicating required call sequence (Lunderberg, Jan 17, 2024)
febc355  [Debug] Add LOG.debug statements for safetensor loading (Lunderberg, Feb 22, 2024)
8b51544  [LoRA] Add --lora=path/to/lora.safetensors argument (Lunderberg, Jan 10, 2024)
4e09fa7  [LoRA] Implement utility functions to get the rank of each LoRA (Lunderberg, Jan 17, 2024)
2b62e5a  [LoRA] Assert that `num_input` is present instead of redefining it (Lunderberg, Jan 17, 2024)
d202e7b  [LoRA] Implement optimization passes for LoRA models (Lunderberg, Feb 8, 2024)
c15ed34  [LoRA] Add transforms to inject/optimize LoRA (Lunderberg, Feb 8, 2024)
4741afb  Handle bfloat16 -> float16 conversions for any dimension of tensor (Lunderberg, Feb 29, 2024)
677fe40  Normalize from (bfloat16 | float16) to float16 (Lunderberg, Mar 1, 2024)
1b5620e  Black auto-format (Lunderberg, Mar 1, 2024)
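For readers coming to this PR for the LoRA commits above: the injected computation is the standard low-rank update, the frozen base matmul plus a rank-r correction loaded from the --lora safetensors file. A minimal NumPy sketch of the arithmetic (illustrative only; the names and the scaling factor are assumptions, not code from this PR):

import numpy as np

rng = np.random.default_rng(0)
d, r = 8, 2                           # hidden size and LoRA rank
x = rng.standard_normal((4, d))       # activations
w = rng.standard_normal((d, d))       # frozen base weight
lora_a = rng.standard_normal((d, r))  # low-rank factors, e.g. loaded from lora.safetensors
lora_b = rng.standard_normal((r, d))
alpha = 1.0                           # scaling factor (assumed)

# Injected computation: base projection plus the rank-r LoRA correction.
y = x @ w + alpha * (x @ lora_a) @ lora_b
print(y.shape)  # (4, 8)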
822 changes: 725 additions & 97 deletions mlc_llm/core.py

Large diffs are not rendered by default.

52 changes: 49 additions & 3 deletions mlc_llm/relax_model/commons.py
@@ -85,6 +85,27 @@ def shard_k_weight_scale(weight: relax.TensorStructInfo):
func = te.create_prim_func([a, w])
return func

def shard_axis_0(weight: relax.TensorStructInfo):
(red, spatial), dtype = weight.shape, weight.dtype
red, spatial = int(red), int(spatial)
if param_shape_is_already_sharded:
red *= num_shards
a = te.placeholder((red, spatial), dtype=dtype)
w = topi.reshape(a, (num_shards, red // num_shards, spatial))
func = te.create_prim_func([a, w])
return func

def shard_axis_1(weight: relax.TensorStructInfo):
(spatial, red), dtype = weight.shape, weight.dtype
spatial, red = int(spatial), int(red)
if param_shape_is_already_sharded:
red *= num_shards
a = te.placeholder((spatial, red), dtype=dtype)
w = topi.reshape(a, (spatial, num_shards, red // num_shards))
w = topi.transpose(w, (1, 0, 2))
func = te.create_prim_func([a, w])
return func

def shard_gate_up_weight_scale(weight: relax.TensorStructInfo):
(spatial, red), dtype = weight.shape, weight.dtype
spatial, red = int(spatial), int(red)
@@ -135,6 +156,8 @@ def moe_shard_gate_up_weight_scale(weight: relax.TensorStructInfo):
"shard_mlp_k": shard_k_weight_scale,
"shard_o_proj_k": shard_k_weight_scale,
"shard_gate_up": shard_gate_up_weight_scale,
"shard_axis_0": shard_axis_0,
"shard_axis_1": shard_axis_1,
"moe_shard_mlp_k": moe_shard_k_weight_scale,
"moe_shard_gate_up": moe_shard_gate_up_weight_scale,
}
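The new shard_axis_0 / shard_axis_1 strategies registered above only reshape (and, for axis 1, transpose) the weight so that each rank can take its slice along the reduction axis. A minimal NumPy sketch of the equivalent computation (illustrative only, not code from this PR):

import numpy as np

def shard_axis_0(weight: np.ndarray, num_shards: int) -> np.ndarray:
    # (red, spatial) -> (num_shards, red // num_shards, spatial); rank i takes out[i]
    red, spatial = weight.shape
    return weight.reshape(num_shards, red // num_shards, spatial)

def shard_axis_1(weight: np.ndarray, num_shards: int) -> np.ndarray:
    # (spatial, red) -> (num_shards, spatial, red // num_shards); rank i takes out[i]
    spatial, red = weight.shape
    return weight.reshape(spatial, num_shards, red // num_shards).transpose(1, 0, 2)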
@@ -176,6 +199,27 @@ def shard_k_weight(weight: relax.TensorStructInfo):
func = te.create_prim_func([a, w])
return func

def shard_axis_0(weight: relax.TensorStructInfo):
(red, spatial), dtype = weight.shape, weight.dtype
red, spatial = int(red), int(spatial)
if param_shape_is_already_sharded:
red *= num_shards
a = te.placeholder((red, spatial), dtype=dtype)
w = topi.reshape(a, (num_shards, red // num_shards, spatial))
func = te.create_prim_func([a, w])
return func

def shard_axis_1(weight: relax.TensorStructInfo):
(spatial, red), dtype = weight.shape, weight.dtype
spatial, red = int(spatial), int(red)
if param_shape_is_already_sharded:
red *= num_shards
a = te.placeholder((spatial, red), dtype=dtype)
w = topi.reshape(a, (spatial, num_shards, red // num_shards))
w = topi.transpose(w, (1, 0, 2))
func = te.create_prim_func([a, w])
return func

def shard_gate_up_weight_scale(x: relax.TensorStructInfo):
(red, spatial), dtype = x.shape, x.dtype
red, spatial = int(red), int(spatial)
@@ -197,6 +241,8 @@ def shard_gate_up_weight_scale(x: relax.TensorStructInfo):
"shard_mlp_k": shard_k_weight,
"shard_o_proj_k": shard_k_weight,
"shard_gate_up": shard_gate_up_weight_scale,
"shard_axis_0": shard_axis_0,
"shard_axis_1": shard_axis_1,
}


@@ -221,7 +267,7 @@ def add_to_shard_info(param_name: str, func_name: Optional[str]):

shard_info_dict[param_name] = shard_info

q_params = param_manager.get_quantized_param_info("prefill").fields
q_params = [param.struct_info for param in param_manager.get_quantized_params("prefill")]
for _, param in param_manager.params.items():
if param.shard_strategy is None:
pass
@@ -272,7 +318,7 @@ def create_shard_transformation_func(param_manager, args, model_config) -> tvm.I
param_shape_is_already_sharded=args.build_model_only,
)

q_params = param_manager.get_quantized_param_info("prefill").fields
q_params = [param.struct_info for param in param_manager.get_quantized_params("prefill")]

# The order of the quantized parameters must be preserved.
# Therefore, we need to loop over q_params and look up information
@@ -289,7 +335,7 @@
)

bb = relax.BlockBuilder() # pylint: disable=invalid-name
with bb.function("transform_params"):
with bb.function("transform_params", attrs={"num_input": 1}):
rank = tir.SizeVar("rank", "int64")
# TODO(Lunderberg): Support primitive inputs to relax
# functions. Currently, using a PrimStructInfo as the
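The attrs={"num_input": 1} annotation added above marks how many leading parameters of "transform_params" are runtime inputs (here only the shard rank); the remaining parameters are treated as weights that passes such as LiftTransformParams may pre-compute. A short sketch of setting and reading that attribute with the BlockBuilder API (illustrative names, a hedged example rather than code from this PR):

from tvm import relax

bb = relax.BlockBuilder()
x = relax.Var("x", relax.TensorStructInfo((16, 16), "float16"))  # runtime input
w = relax.Var("w", relax.TensorStructInfo((16, 16), "float16"))  # weight parameter
with bb.function("example", params=[x, w], attrs={"num_input": 1}):
    y = bb.emit(relax.op.matmul(x, w))
    bb.emit_func_output(y)

mod = bb.get()
# Only the first parameter is a runtime input; the rest can be lifted/pre-computed.
assert int(mod["example"].attrs["num_input"]) == 1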
61 changes: 50 additions & 11 deletions mlc_llm/relax_model/llama.py
@@ -265,6 +265,12 @@ def __init__(self, config: LlamaConfig):
self.gate_proj = Linear(hidden_size, intermediate_size, dtype=dtype, bias=False)
self.down_proj = Linear(intermediate_size, hidden_size, dtype=dtype, bias=False)
self.up_proj = Linear(hidden_size, intermediate_size, dtype=dtype, bias=False)
self.gate_proj.weight.shard_dim = 0
self.gate_proj.weight.shard_strategy = "shard_axis_0"
self.down_proj.weight.shard_dim = 1
self.down_proj.weight.shard_strategy = "shard_axis_1"
self.up_proj.weight.shard_dim = 0
self.up_proj.weight.shard_strategy = "shard_axis_0"

self.act = {"silu": relax.op.nn.silu, "gelu": relax.op.nn.gelu}[config.hidden_act]

@@ -375,6 +381,9 @@ def __init__(self, config: LlamaConfig):
self.q_proj.weight.shard_dim = 0
self.k_proj.weight.shard_dim = 0
self.v_proj.weight.shard_dim = 0
self.q_proj.weight.shard_strategy = "shard_axis_0"
self.k_proj.weight.shard_strategy = "shard_axis_0"
self.v_proj.weight.shard_strategy = "shard_axis_0"

self.o_proj = Linear(
self.head_dim * self.num_query_heads,
@@ -1250,7 +1259,6 @@ def emit_paged_kv_cache_op(bb: relax.BlockBuilder, config: LlamaConfig) -> None:
num_heads = config.num_key_value_heads
head_dim = config.hidden_size // config.num_attention_heads

# fmt: off
@T.prim_func
def kv_cache_transpose_append(
var_pages: T.handle,
@@ -1269,7 +1277,11 @@ def kv_cache_transpose_append(
page_size = T.SizeVar("page_size", "int64")
num_pages = T.int64()

pages = T.match_buffer(var_pages, (num_pages, num_layers, 2, num_heads, page_size, head_dim), config.dtype)
pages = T.match_buffer(
var_pages,
(num_pages, num_layers, 2, num_heads, page_size, head_dim),
config.dtype,
)
k_data = T.match_buffer(var_k_data, (ntoken, num_heads, head_dim), config.dtype)
v_data = T.match_buffer(var_v_data, (ntoken, num_heads, head_dim), config.dtype)
last_page_offset = T.match_buffer(var_last_page_offset, (nseq,), "int32")
@@ -1281,10 +1293,23 @@
for global_pos, h, f in T.grid(ntoken, num_heads, head_dim):
with T.block("k_transpose_append"):
vgpos, vh, vf = T.axis.remap("SSS", [global_pos, h, f])
seq_idx: T.int64 = T.Cast("int64", pos2seqidx[vgpos])
seqlen: T.int64 = T.Cast("int64", (page_table_indptr[seq_idx + 1] - page_table_indptr[seq_idx] - 1) * page_size + last_page_offset[seq_idx])

seq_idx = T.meta_var(pos2seqidx[vgpos].astype("int64"))
seqlen = T.meta_var(
(
(page_table_indptr[seq_idx + 1] - page_table_indptr[seq_idx] - 1)
* page_size
+ last_page_offset[seq_idx]
).astype("int64")
)

pages[
page_table_values[page_table_indptr[seq_idx] + T.floordiv(seqlen - (append_length_indptr[seq_idx + 1] - vgpos), page_size)],
page_table_values[
page_table_indptr[seq_idx]
+ T.floordiv(
seqlen - (append_length_indptr[seq_idx + 1] - vgpos), page_size
)
],
layer_id,
0,
vh,
@@ -1293,17 +1318,29 @@
] = k_data[vgpos, vh, vf]
with T.block("v_transpose_append"):
vgpos, vh, vf = T.axis.remap("SSS", [global_pos, h, f])
seq_idx: T.int64 = T.Cast("int64", pos2seqidx[vgpos])
seqlen: T.int64 = T.Cast("int64", (page_table_indptr[seq_idx + 1] - page_table_indptr[seq_idx] - 1) * page_size + last_page_offset[seq_idx])

seq_idx = T.meta_var(pos2seqidx[vgpos].astype("int64"))
seqlen = T.meta_var(
(
(page_table_indptr[seq_idx + 1] - page_table_indptr[seq_idx] - 1)
* page_size
+ last_page_offset[seq_idx]
).astype("int64")
)

pages[
page_table_values[page_table_indptr[seq_idx] + T.floordiv(seqlen - (append_length_indptr[seq_idx + 1] - vgpos), page_size)],
page_table_values[
page_table_indptr[seq_idx]
+ T.floordiv(
seqlen - (append_length_indptr[seq_idx + 1] - vgpos), page_size
)
],
layer_id,
1,
vh,
T.floormod(seqlen - (append_length_indptr[seq_idx + 1] - vgpos), page_size),
vf,
] = v_data[vgpos, vh, vf]
# fmt: on

bb.add_func(kv_cache_transpose_append, "kv_cache_transpose_append")
bb.add_func(relax.extern("paged_kv_cache.attention_kernel_prefill"), "attention_prefill")
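The index arithmetic in kv_cache_transpose_append is easier to follow written out in plain Python. A sketch (hypothetical helper, not code from this PR) of how a token's destination page and in-page slot are located:

def locate_kv_slot(global_pos, pos2seqidx, page_table_indptr, page_table_values,
                   last_page_offset, append_length_indptr, page_size):
    seq_idx = pos2seqidx[global_pos]
    # Total tokens in the sequence once this append completes.
    seqlen = ((page_table_indptr[seq_idx + 1] - page_table_indptr[seq_idx] - 1)
              * page_size + last_page_offset[seq_idx])
    # Distance of this token from the end of the sequence.
    back_offset = append_length_indptr[seq_idx + 1] - global_pos
    logical_pos = seqlen - back_offset
    page = page_table_values[page_table_indptr[seq_idx] + logical_pos // page_size]
    slot = logical_pos % page_size   # matches the T.floormod(...) index in the kernel
    return page, slot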
@@ -1516,7 +1553,8 @@ def get_model(args, hf_config):
**hf_config,
dtype=dtype,
position_embedding_base=position_embedding_base,
combine_matmul=True,
# TODO: Re-enable with CombineParallelMatmul
combine_matmul=False,
Review comment (Member): Is CombineParallelMatmul with lora supported now?

Reply (Member, PR author): Unfortunately, no. It will require improvements to LiftTransformParams, in order to lift out a parameter transformation that can be used for every function in an IRModule.
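For context, CombineParallelMatmul fuses several matmuls that share the same activation into one matmul over a concatenated weight; that weight concatenation is itself a parameter transform, which is why it interacts with LiftTransformParams. A rough NumPy sketch of the fusion (illustrative only, not code from this PR):

import numpy as np

rng = np.random.default_rng(0)
x = rng.standard_normal((4, 8))
w_q, w_k, w_v = (rng.standard_normal((8, 8)) for _ in range(3))

# Unfused: three parallel matmuls against the same activation.
q, k, v = x @ w_q, x @ w_k, x @ w_v

# Fused: concatenate the weights (a parameter-only transform), run one matmul,
# then split the output.
w_qkv = np.concatenate([w_q, w_k, w_v], axis=1)
q2, k2, v2 = np.split(x @ w_qkv, 3, axis=1)

assert np.allclose(q, q2) and np.allclose(k, k2) and np.allclose(v, v2)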

num_shards=args.num_shards,
build_model_only=args.build_model_only,
)
@@ -1526,7 +1564,8 @@
dtype=dtype,
max_sequence_length=hf_config["max_position_embeddings"],
position_embedding_base=position_embedding_base,
combine_matmul=True,
# TODO: Re-enable with CombineParallelMatmul
combine_matmul=False,
num_shards=args.num_shards,
build_model_only=args.build_model_only,
)
9 changes: 6 additions & 3 deletions mlc_llm/relax_model/llama_batched_vllm.py
@@ -1052,7 +1052,8 @@ def get_model(args, hf_config):
dtype=dtype,
max_sequence_length=hf_config["max_position_embeddings"],
position_embedding_base=position_embedding_base,
combine_matmul=True,
# combine_matmul=True,
combine_matmul=False,
num_shards=args.num_shards,
build_model_only=args.build_model_only,
quantization_scheme=args.quantization,
@@ -1072,7 +1073,8 @@
**hf_config,
dtype=dtype,
position_embedding_base=position_embedding_base,
combine_matmul=True,
# combine_matmul=True,
combine_matmul=False,
num_shards=args.num_shards,
build_model_only=args.build_model_only,
)
@@ -1082,7 +1084,8 @@
dtype=dtype,
max_sequence_length=hf_config["max_position_embeddings"],
position_embedding_base=position_embedding_base,
combine_matmul=True,
# combine_matmul=True,
combine_matmul=False,
num_shards=args.num_shards,
build_model_only=args.build_model_only,
)
27 changes: 16 additions & 11 deletions mlc_llm/relax_model/mistral.py
@@ -664,7 +664,9 @@ def forward(self, input_ids: relax.Expr):


class MistralModel(nn.Module):
def __init__(self, config: MistralConfig, vocab_size_var: tvm.tir.SizeVar, sep_embed: bool = False):
def __init__(
self, config: MistralConfig, vocab_size_var: tvm.tir.SizeVar, sep_embed: bool = False
):
self.num_shards = config.num_shards
self.padding_idx = config.pad_token_id
self.embed_tokens = None
@@ -730,7 +732,9 @@ def forward(


class MistralForCausalLM(nn.Module):
def __init__(self, config: MistralConfig, vocab_size_var: tvm.tir.SizeVar, sep_embed: bool = False):
def __init__(
self, config: MistralConfig, vocab_size_var: tvm.tir.SizeVar, sep_embed: bool = False
):
self.model = MistralModel(config, vocab_size_var, sep_embed)
self.lm_head = Linear(config.hidden_size, vocab_size_var, dtype=config.dtype, bias=False)

@@ -827,13 +831,13 @@ def create_encoding_func(

bsz = 1
seq_len = tvm.tir.SizeVar("n", "int64") # number of tokens for the input
rolling_cache_len = tvm.tir.SizeVar("c", "int64") # rolling_cache_len captures number of elements in the cache
rolling_cache_len = tvm.tir.SizeVar(
"c", "int64"
) # rolling_cache_len captures number of elements in the cache
kv_seq_len = tvm.tir.SizeVar(
"k", "int64"
) # kv_seq_len captures number of elements in cache + seq_len
cache_offset = tvm.tir.SizeVar(
"o", "int64"
) # slidinf window kv cache offset
cache_offset = tvm.tir.SizeVar("o", "int64")  # sliding window kv cache offset

hidden_size = config.hidden_size
with bb.function(func_name):
@@ -888,13 +892,13 @@ def create_decoding_func(
func_name = "decode"

bsz = 1
rolling_cache_len = tvm.tir.SizeVar("c", "int64") # rolling_cache_len captures number of elements in the cache
rolling_cache_len = tvm.tir.SizeVar(
"c", "int64"
) # rolling_cache_len captures number of elements in the cache
kv_seq_len = tvm.tir.SizeVar(
"k", "int64"
) # kv_seq_len captures number of elements in cache + seq_len
cache_offset = tvm.tir.SizeVar(
"o", "int64"
) # sliding window kv cache offset
cache_offset = tvm.tir.SizeVar("o", "int64") # sliding window kv cache offset

with bb.function(func_name):
model = MistralForCausalLM(config, tvm.tir.SizeVar("vocab_size", "int64"))
@@ -992,7 +996,8 @@ def get_model(args, hf_config):
config = MistralConfig(
**hf_config,
dtype=dtype,
combine_matmul=True,
# combine_matmul=True,
combine_matmul=False,
num_shards=args.num_shards,
build_model_only=args.build_model_only,
)
4 changes: 2 additions & 2 deletions mlc_llm/relax_model/mixtral.py
@@ -350,14 +350,14 @@ def top2_softmax_func(
for j in T.unroll(2):
with T.block("cast"):
vj = T.axis.remap("S", [j])
local_top_k_f32[vj] = T.cast(local_top_k[j], "float32")
local_top_k_f32[vj] = T.cast(local_top_k[vj], "float32")
with T.block("max"):
local_top_k_max[0] = T.max(local_top_k_f32[0], local_top_k_f32[1])
for j in T.unroll(2):
with T.block("output"):
vj = T.axis.remap("S", [j])
out[vi, vj] = T.cast(
T.exp(local_top_k_f32[j] - local_top_k_max[0])
T.exp(local_top_k_f32[vj] - local_top_k_max[0])
/ (
T.exp(local_top_k_f32[0] - local_top_k_max[0])
+ T.exp(local_top_k_f32[1] - local_top_k_max[0])
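The Mixtral hunk above only replaces the loop variable j with the block iteration variable vj so the TIR is well-formed; the computation itself is unchanged. For reference, the numerically stable two-element softmax it implements, sketched in plain Python (illustrative only):

import math

def top2_softmax(x0: float, x1: float) -> tuple:
    # Subtract the max before exponentiating, mirroring the "max" block in the kernel.
    m = max(x0, x1)
    e0, e1 = math.exp(x0 - m), math.exp(x1 - m)
    return e0 / (e0 + e1), e1 / (e0 + e1)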