
Commit 6793994

symint and other fixes
1 parent e14e255 commit 6793994

5 files changed: +40 -128 lines changed


tests/kernels/test_layernorm.py

Lines changed: 25 additions & 0 deletions
@@ -55,6 +55,13 @@ def test_rms_norm(
    else:
        torch.testing.assert_close(out, ref_out, atol=1e-2, rtol=1e-2)

+    if residual is not None:
+        opcheck(torch.ops._C.fused_add_rms_norm,
+                (x, residual, layer.weight.data, layer.variance_epsilon))
+    else:
+        opcheck(torch.ops._C.rms_norm,
+                (out, x, layer.weight.data, layer.variance_epsilon))
+

@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)

@@ -119,6 +126,15 @@ def test_rms_norm_quant(
    if add_residual:
        assert torch.allclose(residual1, residual2, atol=1e-3)

+    if add_residual:
+        opcheck(torch.ops._C.add_residual_rms_norm_quant,
+                (out2, x_, residual2, tmp, layer.weight.data, scale2,
+                 layer.variance_epsilon))
+    else:
+        opcheck(
+            torch.ops._C.rms_norm_quant,
+            (out2, x_, tmp, layer.weight.data, scale2, layer.variance_epsilon))
+

@pytest.mark.parametrize("num_tokens", NUM_TOKENS)
@pytest.mark.parametrize("hidden_size", HIDDEN_SIZES)

@@ -180,3 +196,12 @@ def test_rms_norm_quant2(
    assert torch.allclose(out1, out2, atol=2.0)
    if add_residual:
        assert torch.allclose(residual1, residual2, atol=1e-3)
+
+    if add_residual:
+        opcheck(torch.ops._C.add_residual_rms_norm_quant,
+                (out2, x_, residual2, tmp, layer.weight.data, scale1,
+                 layer.variance_epsilon))
+    else:
+        opcheck(
+            torch.ops._C.rms_norm_quant,
+            (out2, x_, tmp, layer.weight.data, scale1, layer.variance_epsilon))
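Note: the opcheck helper used by these tests is presumably a thin wrapper around torch.library.opcheck. A minimal, self-contained sketch of the pattern (assumes PyTorch >= 2.4; "mylib::scale" is a hypothetical op used only for illustration, not part of this commit):

import torch

torch.library.define("mylib::scale", "(Tensor x, float alpha) -> Tensor")


@torch.library.impl("mylib::scale", "CompositeExplicitAutograd")
def _scale(x: torch.Tensor, alpha: float) -> torch.Tensor:
    return x * alpha


@torch.library.register_fake("mylib::scale")
def _scale_fake(x: torch.Tensor, alpha: float) -> torch.Tensor:
    # Fake (meta) impl: only shape/dtype/device matter here.
    return torch.empty_like(x)


# opcheck exercises the schema, the fake-tensor path, and autograd
# registration for the given sample arguments.
torch.library.opcheck(torch.ops.mylib.scale, (torch.randn(4, 8), 2.0))

The tests above run the same kind of check against vLLM's compiled _C ops, so a schema or fake-kernel mismatch fails loudly instead of only surfacing under torch.compile.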

vllm/attention/backends/flash_attn.py

Lines changed: 0 additions & 113 deletions
@@ -121,116 +121,6 @@ def _(
    return torch.empty_like(decode_query)


-
-@torch.library.impl("vllm::flash_attn_varlen_func", "cuda")
-def _flash_attn_varlen_func(
-    out_shape,
-    q,
-    k,
-    v,
-    cu_seqlens_q,
-    cu_seqlens_k,
-    max_seqlen_q,
-    max_seqlen_k,
-    softmax_scale,
-    causal,
-    window_size,
-    alibi_slopes,
-    block_table,
-):
-    return flash_attn_varlen_func(
-        q=q,
-        k=k,
-        v=v,
-        cu_seqlens_q=cu_seqlens_q,
-        cu_seqlens_k=cu_seqlens_k,
-        max_seqlen_q=max_seqlen_q,
-        max_seqlen_k=max_seqlen_k,
-        softmax_scale=softmax_scale,
-        causal=causal,
-        window_size=window_size,
-        alibi_slopes=alibi_slopes,
-        block_table=block_table,
-    )
-
-
-@torch.library.impl_abstract("vllm::flash_attn_varlen_func")
-def _flash_attn_varlen_func_meta(
-    out_shape,
-    q,
-    k,
-    v,
-    cu_seqlens_q,
-    cu_seqlens_k,
-    max_seqlen_q,
-    max_seqlen_k,
-    softmax_scale,
-    causal,
-    window_size,
-    alibi_slopes,
-    block_table,
-):
-    # TODO: is this always correct?
-    return torch.empty(out_shape,
-                       dtype=q.dtype,
-                       layout=q.layout,
-                       device=q.device)
-
-
-torch.library.define("vllm::flash_attn_with_kvcache", ("(int[] out_shape, "
-                                                       "Tensor q, "
-                                                       "Tensor k, "
-                                                       "Tensor v, "
-                                                       "Tensor block_table, "
-                                                       "Tensor cache_seqlens, "
-                                                       "float softmax_scale, "
-                                                       "bool causal, "
-                                                       "float[]? alibi_slopes"
-                                                       ") -> Tensor"))
-
-
-@torch.library.impl("vllm::flash_attn_with_kvcache", "cuda")
-def _flash_attn_with_kvcache(
-    out_shape,
-    decode_query,
-    key_cache,
-    value_cache,
-    block_table,
-    cache_seqlens,
-    softmax_scale,
-    causal,
-    alibi_slopes,
-):
-    return flash_attn_with_kvcache(
-        decode_query,
-        key_cache,
-        value_cache,
-        block_table=block_table,
-        cache_seqlens=cache_seqlens,
-        softmax_scale=softmax_scale,
-        causal=causal,
-        alibi_slopes=alibi_slopes,
-    )
-
-
-@torch.library.impl_abstract("vllm::flash_attn_with_kvcache")
-def _flash_attn_with_kvcache_meta(
-    out_shape,
-    decode_query,
-    key_cache,
-    value_cache,
-    block_table,
-    cache_seqlens,
-    softmax_scale,
-    causal,
-    alibi_slopes,
-):
-    return torch.empty(out_shape,
-                       dtype=decode_query.dtype,
-                       layout=decode_query.layout,
-                       device=decode_query.device)
-
-
class FlashAttentionBackend(AttentionBackend):

    @staticmethod

@@ -779,7 +669,6 @@ def forward(
                # When block_tables are not filled, it means q and k are the
                # prompt, and they have the same length.
                out = torch.ops.vllm.flash_attn_varlen_func(
-                    out_shape=output[:num_prefill_tokens].size(),
                    q=query,
                    k=key,
                    v=value,

@@ -817,10 +706,8 @@ def forward(

        if decode_meta := attn_metadata.decode_metadata:
            # Decoding run.
-            output_shape = output[num_prefill_tokens:].squeeze(1).size()
            output[
                num_prefill_tokens:] = torch.ops.vllm.flash_attn_with_kvcache(
-                    output_shape,
                    decode_query.unsqueeze(1),
                    key_cache,
                    value_cache,
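With out_shape removed from the op signatures, the remaining fake (meta) registrations are left to derive the output shape from the inputs themselves, which is what lets symbolic sizes (SymInt) flow through torch.compile without threading an explicit int[] argument. A hedged, self-contained sketch of that pattern (the op name and schema below are made up for illustration; they are not this file's code):

import torch

# Hypothetical decode-attention op: the fake kernel infers the output from
# the query tensor, so callers never pass an out_shape argument.
torch.library.define(
    "mylib::attn_decode",
    "(Tensor q, Tensor k_cache, Tensor v_cache, Tensor block_table) -> Tensor")


@torch.library.register_fake("mylib::attn_decode")
def _attn_decode_fake(q, k_cache, v_cache, block_table):
    # Output is shaped like the query, so the shape stays correct even when
    # the sizes are symbolic under dynamic-shape compilation.
    return torch.empty_like(q)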

vllm/model_executor/model_optimizer/fused_op_generator_utils.py

Lines changed: 13 additions & 14 deletions
@@ -68,27 +68,26 @@ def arg_schema_type(n: torch.fx.node.Argument,
    """
    Get the schema or C++ type for a fused op argument.
    """
-    if isinstance(n, float):
-        return "float"
-    elif isinstance(n, int):
-        return "int"
+    if n.type is not None:
+        ty = n.type.__name__
+    elif n.meta.get('type') and n.meta.get('type').__name__ != 'FakeTensor':
+        ty = n.meta.get('type').__name__
+        print(f"meta type {ty}")
+        if ty == 'Size':
+            return 'std::vector<int64_t>' if add_prefix else 'int[]'
    else:
-        if n.type is not None:
-            ty = n.type.__name__
-        elif n.meta.get(
-                'type') and n.meta.get('type').__name__ != 'FakeTensor':
-            ty = n.meta.get('type').__name__
-            if ty == 'Size':
-                return 'std::vector<int64_t> const' if add_prefix else 'int[]'
-        else:
-            # this default is a bit sketchy
-            ty = "Tensor"
+        # this default is a bit sketchy
+        ty = "Tensor"

    builtin_types = {"int": "int64_t", "float": "double"}

    if add_prefix and ty in builtin_types:
        return builtin_types[ty]

+    print(f"arg_schema_type {ty}")
+    if ty == "SymInt" and add_prefix:
+        return "int64_t"
+
    return ty if not add_prefix else f"torch::{ty}"
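For reference, the net effect of arg_schema_type after this change is roughly the mapping sketched below. This is a hedged restatement for illustration only (the names here are not the real helper); in particular it is the part that makes SymInt sizes from dynamic-shape graphs land as plain int64_t in the generated C++ fused-op signature.

# Simplified sketch of the schema-type -> C++-type mapping on the
# add_prefix=True path; see arg_schema_type above for the real logic,
# which also pulls the type from n.type or n.meta['type'].
SCHEMA_TO_CPP = {
    "int": "int64_t",
    "float": "double",
    "SymInt": "int64_t",             # symbolic ints are passed as int64_t
    "Size": "std::vector<int64_t>",  # torch.Size arguments
}


def cpp_arg_type(ty: str) -> str:
    # Anything not in the table is treated as a torch type, e.g. torch::Tensor.
    return SCHEMA_TO_CPP.get(ty, f"torch::{ty}")


assert cpp_arg_type("SymInt") == "int64_t"
assert cpp_arg_type("Tensor") == "torch::Tensor"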

vllm/model_executor/model_optimizer/model_optimizer.py

Lines changed: 1 addition & 1 deletion
@@ -88,7 +88,7 @@ def __init__(self, backend: Optional[str] = 'inductor'):
    def __call__(self, gm: torch.fx.GraphModule,
                 example_inputs: List[torch.Tensor]) -> Callable:
        # Temporarily disable optimizer so we can collect dynamo issues.
-        return gm
+        #return gm

        logger.info("Graph optimizer start")

vllm/model_executor/model_optimizer/naive_fused_op_generator.py

Lines changed: 1 addition & 0 deletions
@@ -320,6 +320,7 @@ def make_fused_op(
        f"{arg_schema_type(inp, True)}" for inp in inputs.values()
    ]
    logger.debug("fused op argument types: %s", arg_types)
+    print(f"fused op argument types: {str(arg_types)}")
    for i, name in enumerate(inputs.keys()):
        # Don't use const refs here so inputs can be deleted when no
        # longer needed.
