
Commit 590b3d2

cleanups
1 parent f0059cb commit 590b3d2

File tree

8 files changed (+222, -101 lines)

csrc/torch_bindings.cpp

Lines changed: 6 additions & 1 deletion
@@ -377,7 +377,12 @@ TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) {
           "bool silu_activation,"
           "int pad_slot_id) -> ()");
   ops.impl("causal_conv1d_fwd", torch::kCUDA, &causal_conv1d_fwd);
-#endif
+
+  ops.def("cublas_gemm_rs() -> Tensor");
+  ops.impl("cublas_gemm_rs", torch::kCUDA, &cublas_gemm_rs);
+  ops.def("cublas_ag_gemm() -> Tensor");
+  ops.impl("cublas_ag_gemm", torch::kCUDA, &cublas_ag_gemm);
+#endif // !USE_ROCM

   // Quantized GEMM for GPTQ.
   // Note: even though the C++ inferred schema is correct for this op, it seems
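For reference (not part of the diff): ops registered through TORCH_LIBRARY_EXPAND become callable from Python under torch.ops.<namespace>. A hedged sketch, assuming the extension namespace resolves to `_C` as elsewhere in vLLM's bindings, and taking the argument-free schemas above at face value:

import torch

# Hedged sketch only: both schemas above take no arguments and return a Tensor,
# so the Python-side calls would simply be:
rs_out = torch.ops._C.cublas_gemm_rs()   # namespace `_C` is an assumption
ag_out = torch.ops._C.cublas_ag_gemm()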

vllm/compilation/backends.py

Lines changed: 124 additions & 12 deletions
@@ -26,8 +26,125 @@
 logger = init_logger(__name__)


-def pprint(x):
-    pass
+class InductorHashCache:
+    """
+    Disk format: a Python list of tuples, each tuple is
+    (runtime_shape, graph_index, hash_str).
+    We use a list of tuples for readability.
+
+    In-memory format: a defaultdict of dicts, where the key is
+    runtime_shape, and the value is a dict of graph_index to hash_str.
+
+    The data is essentially `Dict[Optional[int], Dict[int, str]]`.
+    We don't use json here because json doesn't support int as a key.
+
+    TODO: better off-the-shelf solution to serialize the data?
+    """
+
+    def __init__(self, cache_dir: str, disabled: bool = False):
+        self.cache: defaultdict = defaultdict(dict)
+        self.disabled = disabled
+        self.cache_dir = cache_dir
+        self.cache_file_path = os.path.join(cache_dir,
+                                            "inductor_hash_cache.py")
+        if disabled:
+            return
+        # set flags so that Inductor and Triton store their cache
+        # in the cache_dir, then users only need to copy the cache_dir
+        # to another machine to reuse the cache.
+        inductor_cache = os.path.join(cache_dir, "inductor_cache")
+        os.makedirs(inductor_cache, exist_ok=True)
+        os.environ["TORCHINDUCTOR_CACHE_DIR"] = inductor_cache
+        triton_cache = os.path.join(cache_dir, "triton_cache")
+        os.makedirs(triton_cache, exist_ok=True)
+        os.environ["TRITON_CACHE_DIR"] = triton_cache
+        if os.path.exists(self.cache_file_path):
+            with open(self.cache_file_path) as f:
+                self.deserialize(f.read())
+
+    def deserialize(self, data: str):
+        # we use ast.literal_eval to parse the data
+        # because it is a safe way to parse Python literals.
+        # do not use eval(), it is unsafe.
+        try:
+            list_data = ast.literal_eval(data)
+            for runtime_shape, graph_index, hash_str in list_data:
+                self.cache[runtime_shape][graph_index] = hash_str
+        except Exception as ex:
+            logger.warning("Unable to read cache: %s, error: %s", self.cache_file_path, ex)
+            self.cache.clear()
+            self.disabled = True
+
+    def serialize(self) -> str:
+        data = []
+        for runtime_shape, graph_index_to_hash_str in self.cache.items():
+            for graph_index, hash_str in graph_index_to_hash_str.items():
+                data.append((runtime_shape, graph_index, hash_str))
+        printer = pprint.PrettyPrinter(indent=4)
+        return printer.pformat(data)
+
+    def save_to_file(self):
+        if self.disabled:
+            return
+        with open(self.cache_file_path, "w") as f:
+            f.write(self.serialize())
+
+    def __contains__(self, key: Tuple[Optional[int], int]) -> bool:
+        if self.disabled:
+            return False
+        runtime_shape, graph_index = key
+        return runtime_shape in self.cache and graph_index in self.cache[
+            runtime_shape]
+
+    def __getitem__(self, key: Tuple[Optional[int], int]) -> str:
+        if self.disabled:
+            raise KeyError("cannot read from disabled cache")
+        runtime_shape, graph_index = key
+        return self.cache[runtime_shape][graph_index]
+
+    def __setitem__(self, key: Tuple[Optional[int], int], value: str):
+        # setitem for a disabled cache is fine, because we
+        # don't actually write to the disk
+        runtime_shape, graph_index = key
+        self.cache[runtime_shape][graph_index] = value
+
+
+class AlwaysHitShapeEnv:
+    """
+    Why do we need this class:
+
+    For normal `torch.compile` usage, every compilation will have
+    one Dynamo bytecode compilation and one Inductor compilation.
+    The Inductor compilation happens under the context of the
+    Dynamo bytecode compilation, and that context is used to
+    determine the dynamic shape information, etc.
+
+    For our use case, we only run Dynamo bytecode compilation once,
+    and run Inductor compilation multiple times with different shapes
+    plus a general shape. The compilation for specific shapes happens
+    outside of the context of the Dynamo bytecode compilation. At that
+    time, we don't have a shape environment to provide to Inductor, and
+    it will fail the Inductor code cache lookup.
+
+    By providing a dummy shape environment that always hits, we can
+    make the Inductor code cache lookup always hit, and we can
+    compile the graph for different shapes as needed.
+
+    The following dummy methods were obtained by trial and error
+    until it worked.
+    """
+
+    def __init__(self) -> None:
+        self.guards: List[Any] = []
+
+    def evaluate_guards_expression(self, *args, **kwargs):
+        return True
+
+    def get_pruned_guards(self, *args, **kwargs):
+        return []
+
+    def produce_guards_expression(self, *args, **kwargs):
+        return ""


 def wrap_inductor(graph: fx.GraphModule,
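The InductorHashCache docstring above describes the disk format (a list of (runtime_shape, graph_index, hash_str) tuples) and the in-memory Dict[Optional[int], Dict[int, str]]. As a standalone illustration of that round-trip (not vLLM code, just the pprint/ast.literal_eval mechanism the class relies on):

import ast
import pprint
from collections import defaultdict
from typing import Dict, Optional

# In-memory format: runtime_shape (None = general shape) -> {graph_index: hash_str}
cache: Dict[Optional[int], Dict[int, str]] = defaultdict(dict)
cache[None][0] = "hash_for_general_shape"
cache[8][0] = "hash_for_shape_8"

# Serialize: flatten to a list of tuples and pretty-print it (the disk format).
serialized = pprint.PrettyPrinter(indent=4).pformat(
    [(shape, idx, h) for shape, entries in cache.items()
     for idx, h in entries.items()])

# Deserialize: ast.literal_eval safely parses the Python literal (never eval()).
restored: Dict[Optional[int], Dict[int, str]] = defaultdict(dict)
for runtime_shape, graph_index, hash_str in ast.literal_eval(serialized):
    restored[runtime_shape][graph_index] = hash_str

assert restored == cache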
@@ -369,6 +486,7 @@ def configure_post_pass(self):
         inductor_config[PASS_KEY] = self.post_grad_pass_manager

     def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
+
         # when dynamo calls the backend, it means the bytecode
         # transform and analysis are done
         compilation_counter.num_graphs_seen += 1
@@ -385,16 +503,16 @@ def __call__(self, graph: fx.GraphModule, example_inputs) -> Callable:
         self.configure_post_pass()

         if ("before_split_graph"
-                in self.compilation_configs.pass_config.dump_graph_stages):
-            dump_graph(self.compilation_configs.pass_config, graph.graph,
+                in self.compilation_config.pass_config.dump_graph_stages):
+            dump_graph(self.compilation_config.pass_config, graph.graph,
                        "before_split_graph")

         self.split_gm, self.piecewise_graphs = split_graph(
             graph, self.compilation_config.splitting_ops)

         if ("after_split_graph"
-                in self.compilation_configs.pass_config.dump_graph_stages):
-            dump_graph(self.compilation_configs.pass_config,
+                in self.compilation_config.pass_config.dump_graph_stages):
+            dump_graph(self.compilation_config.pass_config,
                        self.split_gm.graph, "after_split_graph")

         compilation_counter.num_piecewise_graphs_seen += len(
@@ -541,13 +659,11 @@ def __call__(self, *args) -> Any:
         if not self.first_run_finished:
             self.first_run_finished = True
             self.check_for_ending_compilation()
-            pprint(f"RUN GENERAL 1")
             return self.compiled_graph_for_general_shape(*args)

         runtime_shape = args[self.sym_shape_indices[0]]
         if runtime_shape not in self.concrete_size_entries:
             # we don't need to do anything for this shape
-            pprint(f"RUN GENERAL 2 - {runtime_shape}")
             return self.compiled_graph_for_general_shape(*args)

         entry = self.concrete_size_entries[runtime_shape]
@@ -574,7 +690,6 @@ def __call__(self, *args) -> Any:
             self.check_for_ending_compilation()

         if not entry.use_cudagraph:
-            pprint(f"RUN STATIC {runtime_shape}")
             return entry.runnable(*args)

         if entry.cudagraph is None:
@@ -586,7 +701,6 @@ def __call__(self, *args) -> Any:
                         entry.num_finished_warmup,
                         self.compilation_config.cudagraph_num_of_warmups,
                         runtime_shape)
-                pprint(f"RUN STATIC CUDAGRAPH WARMUP 1 {runtime_shape}")
                 return entry.runnable(*args)

             if self.is_first_graph:
@@ -617,7 +731,6 @@ def __call__(self, *args) -> Any:
             # mind-exploding: carefully manage the reference and memory.
             with torch.cuda.graph(cudagraph, pool=self.graph_pool):
                 # `output` is managed by pytorch's cudagraph pool
-                pprint(f"RUN STATIC CUDAGRAPH WARMUP 2 {runtime_shape}")
                 output = entry.runnable(*args)
                 if self.is_last_graph:
                     # by converting it to weak ref,
@@ -649,6 +762,5 @@ def __call__(self, *args) -> Any:
                 f" Expected {entry.input_addresses}, got {new_input_addresses}"
             )

-        pprint(f"RUN STATIC CUDAGRAPH REPLAY {runtime_shape}")
         entry.cudagraph.replay()
         return entry.output
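The capture/replay path whose debug prints are removed above follows PyTorch's documented CUDA graph pattern. A minimal generic sketch of that pattern (plain PyTorch API, not vLLM code):

import torch

def run(x: torch.Tensor) -> torch.Tensor:
    return (x * 2.0 + 1.0).relu()

x = torch.randn(8, device="cuda")

# warm up on a side stream before capture, as the CUDA graphs docs recommend
s = torch.cuda.Stream()
s.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(s):
    for _ in range(3):
        run(x)
torch.cuda.current_stream().wait_stream(s)

g = torch.cuda.CUDAGraph()
with torch.cuda.graph(g):
    y = run(x)  # `y` lives in the graph's memory pool, like `output` above

x.copy_(torch.randn(8, device="cuda"))  # refill the captured input buffer in place
g.replay()                              # re-run the captured kernels; `y` is updated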

vllm/compilation/collective_fusion.py

Lines changed: 6 additions & 22 deletions
@@ -7,8 +7,8 @@
                                               fwd_only, register_replacement)

 import vllm.envs as envs
-from vllm.compilation.utils import (find_auto_fn, find_fn, find_getitem,
-                                    find_op, last_node_in_match)
+from vllm.compilation.fx_utils import (find_auto_fn, find_fn, find_getitem,
+                                       find_op, last_node_in_match)
 from vllm.config import CompilationConfig
 from vllm.distributed import (tensor_model_parallel_all_gather,
                               tensor_model_parallel_all_reduce)
@@ -19,6 +19,8 @@

 from .inductor_pass import get_pass_context
 from .vllm_inductor_pass import VllmInductorPass
+from .utils import use_cc_kernels
+

 logger = init_logger(__name__)

@@ -32,21 +34,11 @@
     logger.info("Attempting to use flux but flux not installed.")
     use_flux = False

-# Depends on arch, see auto_tile_shape in include/flux/gemm_hparams.h
-# Can be 256 on sm80.
-FLUX_TILE_SIZE: int = 128
-

 def get_world_name() -> str:
     return torch.distributed.group.WORLD.group_name


-def use_cc_kernels(m_shape: int) -> bool:
-    n_slices = get_tensor_model_parallel_world_size()
-    return (m_shape % (FLUX_TILE_SIZE * n_slices) == 0
-            and m_shape >= FLUX_TILE_SIZE * n_slices)
-
-
 def residual_slice_shape(residual: torch.Tensor, rank: int) -> int:
     n_slices = get_tensor_model_parallel_world_size()
     assert residual.size(0) % n_slices == 0
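use_cc_kernels and FLUX_TILE_SIZE are dropped here because the helper now lives in vllm/compilation/utils.py (imported above as `from .utils import use_cc_kernels`). That file is not shown in this diff; presumably the relocated helper mirrors the deleted code, roughly:

from vllm.distributed import get_tensor_model_parallel_world_size

# Depends on arch, see auto_tile_shape in include/flux/gemm_hparams.h.
# Can be 256 on sm80.
FLUX_TILE_SIZE: int = 128


def use_cc_kernels(m_shape: int) -> bool:
    # the collective-communication (flux) kernels only apply when m is a
    # positive multiple of FLUX_TILE_SIZE * tensor-parallel world size
    n_slices = get_tensor_model_parallel_world_size()
    return (m_shape % (FLUX_TILE_SIZE * n_slices) == 0
            and m_shape >= FLUX_TILE_SIZE * n_slices)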
@@ -79,7 +71,7 @@ def match_gemm_rs_ag_gemm(
     return mm_2, new_residual


-def get_gemm_rs_ag_gemm(use_flux: bool, max_m: int, gemm_1_type: torch.dtype,
+def get_gemm_rs_ag_gemm(max_m: int, gemm_1_type: torch.dtype,
                         gemm_1_weights: torch.Size, gemm_2_type: torch.dtype,
                         gemm_2_weights: torch.Size,
                         tp_group_name: str,
@@ -213,7 +205,6 @@ def gemm_rs_ag_gemm_static(
         rms_norm_weights: torch.Tensor, gemm_2_weights: torch.Tensor,
         first_layer: bool,
     ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
-        print(f"START STATIC FLUX {residual.shape} {first_layer}")
         if first_layer:
             slice_shape = residual_slice_shape(residual, rank)
             residual_chunk = torch.ops.aten.split.Tensor(residual, slice_shape)
@@ -237,8 +228,6 @@

         mm_2 = ag_gemm(output, gemm_2_weights)

-        print(f"END STATIC FLUX {residual.shape} {first_layer}")
-
         return mm_2, new_residual, slice_scatter

     def gemm_rs_ag_gemm_fake(
@@ -304,14 +293,12 @@ def match_final(
 def gemm_ag_final(my_residual: torch.Tensor, gemm_1_weights: torch.Tensor,
                   gemm_1_activations: torch.Tensor,
                   rms_norm_weights: torch.Tensor) -> torch.Tensor:
-    # TODO: use ag gemm here?
     mm_1 = torch.ops.aten.mm.default(gemm_1_activations,
                                      gemm_1_weights.transpose(1, 0))

     reduced = tensor_model_parallel_all_reduce(mm_1)

     if use_cc_kernels(reduced.size(0)):
-        print(f"ALL GATHER {my_residual.size()}, {reduced.size()}")
         wait_tensor = tensor_model_parallel_all_gather(my_residual)
     else:
         assert reduced.size() == my_residual.size()
@@ -322,15 +309,12 @@ def gemm_ag_final(my_residual: torch.Tensor, gemm_1_weights: torch.Tensor,
                   weight=rms_norm_weights,
                   epsilon=1e-05)

-    print(f"DONE FINAL {my_residual.size()}, {reduced.size()}")
-
     return reduced


 def gemm_ag_final_static(my_residual: torch.Tensor, gemm_1_weights: torch.Tensor,
                          gemm_1_activations: torch.Tensor,
                          rms_norm_weights: torch.Tensor) -> torch.Tensor:
-    # TODO: use ag gemm here?
     mm_1 = torch.ops.aten.mm.default(gemm_1_activations,
                                      gemm_1_weights.transpose(1, 0))

@@ -507,7 +491,7 @@ def find_min_index(match: Match) -> int:
             tp_group_name = ar_node.args[1]

             fused_gemm_func, fused_gemm_fake_func = get_gemm_rs_ag_gemm(
-                use_flux, max_m, gemm_1.dtype, gemm_1.shape, gemm_2.dtype,
+                max_m, gemm_1.dtype, gemm_1.shape, gemm_2.dtype,
                 gemm_2.shape, tp_group_name, self.is_static_shape())

             fused_node = graph.call_function(fused_gemm_func,

vllm/compilation/fx_utils.py

Lines changed: 25 additions & 0 deletions
@@ -4,12 +4,27 @@
 from torch import fx
 from torch._higher_order_ops.auto_functionalize import auto_functionalized
 from torch._ops import OpOverload
+from torch._inductor.pattern_matcher import Match


 def is_func(node: fx.Node, target) -> bool:
     return node.op == "call_function" and node.target == target


+def find_fn(nodes: Iterable[fx.Node], op) -> Optional[fx.Node]:
+    for node in nodes:
+        if node.op == "call_function" and node.target == op:
+            return node
+    return None
+
+
+def find_op(nodes: Iterable[fx.Node], op: str) -> Optional[fx.Node]:
+    for node in nodes:
+        if node.op == op:
+            return node
+    return None
+
+
 # Returns the first auto_functionalized node with the given op (if it exists)
 def find_auto_fn_maybe(nodes: Iterable[fx.Node],
                        op: OpOverload) -> Optional[fx.Node]:
@@ -40,3 +55,13 @@ def find_getitem(node: fx.Node, idx: int) -> fx.Node:
     ret = find_getitem_maybe(node, idx)
     assert ret is not None, f"Could not find getitem {idx} in node {node}"
     return ret
+
+
+def last_node_in_match(match: Match) -> fx.Node:
+    if len(match.nodes) > 0:
+        graph = match.nodes[0].graph
+        for n in reversed(graph.nodes):
+            if n in reversed(match.nodes):
+                return n
+    raise ValueError("No nodes in graph")
+
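A small usage sketch for the new helpers (hypothetical example, not part of this commit): find_op matches a node by its opcode string, find_fn by its call_function target.

import torch
from torch import fx

from vllm.compilation.fx_utils import find_fn, find_op


class Toy(torch.nn.Module):

    def forward(self, x):
        return torch.relu(x) + 1


gm = fx.symbolic_trace(Toy())

out_node = find_op(gm.graph.nodes, "output")      # first node with opcode "output"
relu_node = find_fn(gm.graph.nodes, torch.relu)   # first call_function targeting torch.relu
assert out_node is not None and relu_node is not None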
