Commit 2dcd12d

[torch.compile] Fix tests for torch==2.9 inductor partition (#26116)
Signed-off-by: ProExpertProg <lgovedic@redhat.com>
Signed-off-by: Luka Govedič <lgovedic@redhat.com>
1 parent 579d2e5 commit 2dcd12d

File tree: 8 files changed, +138 / -72 lines changed
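Every test file below applies the same gate: parametrize over use_inductor_graph_partition and skip the partitioned variant when torch is older than 2.9. A minimal sketch of that pattern follows; the test name and body here are hypothetical, while is_torch_equal_or_newer is the vLLM helper imported in the diffs below.

import pytest

from vllm.utils import is_torch_equal_or_newer


@pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
def test_partition_gate_sketch(use_inductor_graph_partition: bool):
    # Hypothetical test body: only the skip gate mirrors the commit.
    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
        pytest.skip("Inductor graph partition only supported in torch>=2.9")
    # ... a real test would build and exercise a compiled model here ...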


tests/compile/piecewise/test_full_cudagraph.py

Lines changed: 21 additions & 8 deletions
@@ -11,6 +11,7 @@
 from vllm import LLM, SamplingParams
 from vllm.config import CompilationConfig
 from vllm.platforms import current_platform
+from vllm.utils import is_torch_equal_or_newer


 @contextlib.contextmanager
@@ -32,28 +33,32 @@ def temporary_environ(env_vars):
             os.environ[k] = v


-test_params_full_cudagraph = []
+model_backends_full_cudagraph = []

 # deepseek-ai/DeepSeek-V2-Lite with MLA
 MLA_backends = ["FlashMLA", "FlashAttentionMLA", "CutlassMLA"]
 for mla_backend in MLA_backends:
-    test_params_full_cudagraph.append(
-        pytest.param(("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend]))
+    model_backends_full_cudagraph.append(
+        ("deepseek-ai/DeepSeek-V2-Lite", backend_configs[mla_backend])
     )

 # Qwen/Qwen2-1.5B-Instruct with other backends
 other_backend_configs = [
     backend_configs[c] for c in backend_configs if c not in MLA_backends
 ]
 for backend_config in other_backend_configs:
-    test_params_full_cudagraph.append(
-        pytest.param(("Qwen/Qwen2-1.5B-Instruct", backend_config))
-    )
+    model_backends_full_cudagraph.append(("Qwen/Qwen2-1.5B-Instruct", backend_config))


 @pytest.fixture(scope="class")
 def llm_pair(request):
-    model, backend_config = request.param
+    model, backend_config, use_inductor_graph_partition = request.param
+    backend_config.comp_config["use_inductor_graph_partition"] = (
+        use_inductor_graph_partition
+    )
+
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("Inductor graph partition only supported in torch>=2.9")

     # Dynamically skip test if GPU capability is not met
     if (
@@ -104,7 +109,15 @@ def llm_pair(request):
     )


-@pytest.mark.parametrize("llm_pair", test_params_full_cudagraph, indirect=True)
+@pytest.mark.parametrize(
+    "llm_pair",
+    [
+        pytest.param((model, backend_config, use_inductor_graph_partition))
+        for model, backend_config in model_backends_full_cudagraph
+        for use_inductor_graph_partition in [True, False]
+    ],
+    indirect=True,
+)
 class TestFullCUDAGraph:
     """
     Use a class such that an llm pair is constructed once for all
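The parametrization above feeds three-element tuples into the class-scoped llm_pair fixture through request.param. A self-contained sketch of that indirect-parametrization pattern (fixture and test names here are hypothetical, not part of the commit):

import pytest


@pytest.fixture(scope="class")
def resource_pair(request):
    # Tuples from parametrize(..., indirect=True) arrive as request.param;
    # class scope builds the resource once per parameter set and shares it
    # across every test method in the class.
    model, use_partition = request.param
    yield f"{model}|partition={use_partition}"  # stand-in for the real LLM pair


@pytest.mark.parametrize(
    "resource_pair",
    [(model, flag) for model in ("model-a", "model-b") for flag in (True, False)],
    indirect=True,
)
class TestResourcePairSketch:
    def test_flag_is_recorded(self, resource_pair):
        assert "partition=" in resource_pair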

tests/compile/piecewise/test_multiple_graphs.py

Lines changed: 27 additions & 11 deletions
@@ -5,6 +5,7 @@
 are compiled and graph captured separately.
 """

+import pytest
 import torch
 from torch import nn

@@ -190,7 +191,12 @@ def run_model(
     return output.cpu()


-def test_multi_graph_piecewise_compile_outputs_equal():
+@pytest.mark.parametrize("use_inductor_graph_partition", [False, True])
+def test_multi_graph_piecewise_compile(use_inductor_graph_partition: bool):
+    if use_inductor_graph_partition:
+        # FIXME(luka/boyuan): this currently fails
+        pytest.skip("Inductor graph partition not supported with multi-graph")
+
     outputs = []

     # piecewise compile
@@ -200,6 +206,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=use_inductor_graph_partition,
         )
     )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@@ -220,16 +227,24 @@ def test_multi_graph_piecewise_compile_outputs_equal():
     # static tensor addresses
     inputs = torch.randn(BATCH_SIZE, MLP_SIZE).cuda()

-    with compilation_counter.expect(
-        num_graphs_seen=2,  # two graphs for the model
-        num_piecewise_graphs_seen=6,
+    if use_inductor_graph_partition:
+        # Splitting happens at Inductor lowering level,
+        # total piecewise fx graphs is equal to total graphs
+        num_piecewise_fx = 2
+        num_piecewise_capturable_fx = 2
+    else:
         # attn_one, attn_two each has 3 piecewise graphs
         # (pre attn, post attn, silly_attention) each
-        num_piecewise_capturable_graphs_seen=4,
+        num_piecewise_fx = 6
         # attn_one, attn_two has pre attn and post attn each, total=4
-        num_backend_compilations=4,  # num_piecewise_capturable_graphs_seen
-        num_cudagraph_captured=8,
-        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_piecewise_capturable_fx = 4
+
+    with compilation_counter.expect(
+        num_graphs_seen=2,  # two graphs for the model
+        num_piecewise_graphs_seen=num_piecewise_fx,
+        num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
+        num_backend_compilations=num_piecewise_capturable_fx,
+        num_cudagraph_captured=8,  # num_cudagraph_sizes * num_partitions
     ):
         outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))

@@ -268,6 +283,7 @@ def test_multi_graph_piecewise_compile_outputs_equal():
             level=CompilationLevel.PIECEWISE,
             use_cudagraph=False,
             splitting_ops=["silly::attention"],
+            use_inductor_graph_partition=use_inductor_graph_partition,
         )
     )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@@ -286,9 +302,9 @@ def test_multi_graph_piecewise_compile_outputs_equal():

     with compilation_counter.expect(
         num_graphs_seen=2,
-        num_piecewise_graphs_seen=6,
-        num_piecewise_capturable_graphs_seen=4,
-        num_backend_compilations=4,
+        num_piecewise_graphs_seen=num_piecewise_fx,
+        num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
+        num_backend_compilations=num_piecewise_capturable_fx,
         num_cudagraph_captured=0,  # no cudagraph captured
     ):
         outputs.append(run_model(vllm_config, model, inputs, cudagraph_runtime_mode))
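The branch above encodes a counting argument: with Dynamo-level splitting, each of the two graphs is cut around its attention op into pre-attn, attention, and post-attn pieces, while with Inductor-level partitioning the FX graph stays whole. A hypothetical helper (not part of the commit) that reproduces those expectations:

def expected_piecewise_counts(num_graphs: int, use_inductor_graph_partition: bool):
    """Return (piecewise graphs seen, capturable piecewise graphs)."""
    if use_inductor_graph_partition:
        # Splitting happens at Inductor lowering, so each model keeps one FX graph.
        return num_graphs, num_graphs
    pieces_per_graph = 3  # pre-attn, silly_attention, post-attn
    capturable_per_graph = 2  # the attention piece itself is not capturable
    return num_graphs * pieces_per_graph, num_graphs * capturable_per_graph


# Matches the num_piecewise_* expectations in the hunk above.
assert expected_piecewise_counts(2, use_inductor_graph_partition=False) == (6, 4)
assert expected_piecewise_counts(2, use_inductor_graph_partition=True) == (2, 2)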

tests/compile/piecewise/test_toy_llama.py

Lines changed: 74 additions & 43 deletions
@@ -9,6 +9,7 @@
 initialized randomly with a fixed seed.
 """

+from copy import deepcopy
 from dataclasses import dataclass
 from typing import Any

@@ -26,6 +27,7 @@
     set_current_vllm_config,
 )
 from vllm.forward_context import BatchDescriptor, set_forward_context
+from vllm.utils import is_torch_equal_or_newer

 # This import automatically registers `torch.ops.silly.attention`
 from .. import silly_attention  # noqa: F401
@@ -257,27 +259,13 @@ def tractable_computation(


 @torch.inference_mode
-def run_model(
-    llama_config, use_compile: bool, backend: str, split_attn: bool = False
-) -> torch.Tensor:
-    if use_compile:
-        compilation_config = CompilationConfig(
-            level=CompilationLevel.PIECEWISE,
-            use_cudagraph=True,
-            backend=backend,
-            cudagraph_capture_sizes=[1, 2],
-        )
-        if split_attn:
-            compilation_config.splitting_ops = ["silly::attention"]
-        cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
-    else:
-        compilation_config = CompilationConfig(
-            level=CompilationLevel.NO_COMPILATION,
-        )
-        cudagraph_runtime_mode = CUDAGraphMode.NONE
+def run_model(llama_config, compile_config: CompilationConfig) -> torch.Tensor:
+    # Start with a fresh copy to make sure there's no cache dir sharing
+    compile_config = deepcopy(compile_config)
+    cudagraph_runtime_mode = compile_config.cudagraph_mode

     vllm_config = VllmConfig(
-        compilation_config=compilation_config, additional_config=llama_config
+        compilation_config=compile_config, additional_config=llama_config
     )
     with set_current_vllm_config(vllm_config):
         model = (
@@ -338,8 +326,25 @@ def run_model(
     return output.cpu()


-@pytest.mark.parametrize("backend", ["inductor", "eager"])
-def test_toy_llama(backend: str):
+@pytest.mark.parametrize(
+    "backend, use_inductor_graph_partition",
+    [
+        ("eager", False),  # No inductor
+        ("inductor", False),  # Inductor, Dynamo partition
+        ("inductor", True),  # Inductor, Inductor partition
+    ],
+)
+def test_toy_llama(
+    backend: str, use_inductor_graph_partition: bool, monkeypatch, tmp_path
+):
+    # We disable the vLLM compile cache into a new tmp dir for 2 reasons:
+    # 1. To make sure we can properly track the number of Inductor compilations.
+    # 2. Inductor partitioning does not play nicely with Autograd cache (below)
+    monkeypatch.setenv("VLLM_DISABLE_COMPILE_CACHE", "1")
+
+    if use_inductor_graph_partition and not is_torch_equal_or_newer("2.9.0.dev"):
+        pytest.skip("Inductor graph partition only supported in torch>=2.9")
+
     # compare output with and without piecewise compilation

     llama_config = LlamaConfig(
@@ -350,6 +355,32 @@ def test_toy_llama(backend: str):
         hidden_size=128, mlp_size=256, vocab_size=128, num_layers=2, tractable_init=True
     )

+    compile_config_no_compile = CompilationConfig(
+        level=CompilationLevel.NO_COMPILATION,
+        cudagraph_mode=CUDAGraphMode.NONE,
+        backend="eager",
+    )
+
+    compile_config_no_split = CompilationConfig(
+        level=CompilationLevel.PIECEWISE,
+        use_inductor_graph_partition=use_inductor_graph_partition,
+        cudagraph_mode=CUDAGraphMode.PIECEWISE,
+        backend=backend,
+        cudagraph_capture_sizes=[1, 2],
+    )
+
+    # FIXME(luka/boyuan): the graph from the previous test case
+    # (no inductor partition) gets cached by AotAutograd so then the
+    # compilation with inductor partitioning incorrectly loads an unpartitioned
+    # graph and never partitions. I think this is a bug with custom inductor
+    # partitioning but does not affect vLLM more generally as vLLM uses its own
+    # cache (which takes inductor partitioning into account).
+    if use_inductor_graph_partition:
+        compile_config_no_split.inductor_compile_config["force_disable_caches"] = True
+
+    compile_config_split = deepcopy(compile_config_no_split)
+    compile_config_split.splitting_ops = ["silly::attention"]
+
     outputs = []
     with compilation_counter.expect(
         num_graphs_seen=0,
@@ -358,44 +389,44 @@ def test_toy_llama(backend: str):
         num_backend_compilations=0,
         num_cudagraph_captured=0,
     ):
-        outputs.append(run_model(llama_config, backend="eager", use_compile=False))
-    run_model(tractable_config, backend="eager", use_compile=False)
+        outputs.append(run_model(llama_config, compile_config_no_compile))
+
+    run_model(tractable_config, compile_config_no_compile)

     if backend == "inductor":
         kwargs = {"num_inductor_compiles": 1, "num_eager_compiles": 0}
     else:
         kwargs = {"num_eager_compiles": 1, "num_inductor_compiles": 0}

     with compilation_counter.expect(
-        # One graph for the model
-        num_graphs_seen=1,
+        num_graphs_seen=1,  # one graph for the model
         num_piecewise_graphs_seen=1,
         num_piecewise_capturable_graphs_seen=1,
-        # num_piecewise_capturable_graphs_seen
-        num_backend_compilations=1,
-        # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_backend_compilations=1,  # num_piecewise_capturable_graphs_seen
         num_cudagraph_captured=2,
         **kwargs,
     ):
-        outputs.append(run_model(llama_config, backend=backend, use_compile=True))
-    run_model(tractable_config, backend=backend, use_compile=True)
+        outputs.append(run_model(llama_config, compile_config_no_split))
+
+    run_model(tractable_config, compile_config_no_split)
+
+    if use_inductor_graph_partition:
+        num_piecewise_fx = 1
+        num_piecewise_capturable_fx = 1
+    else:
+        num_piecewise_fx = 2 * llama_config.num_layers + 1
+        num_piecewise_capturable_fx = 1 + llama_config.num_layers

     with compilation_counter.expect(
         num_graphs_seen=1,  # one graph for the model
-        num_piecewise_graphs_seen=2 * llama_config.num_layers + 1,  # 2 * num_layers + 1
-        num_piecewise_capturable_graphs_seen=1
-        + llama_config.num_layers,  # 1 + num_layers
-        num_backend_compilations=1
-        + llama_config.num_layers,  # num_piecewise_capturable_graphs_seen
-        num_cudagraph_captured=2
-        * (
-            1 + llama_config.num_layers
-        ),  # num_cudagraph_sizes * num_piecewise_capturable_graphs_seen
+        num_piecewise_graphs_seen=num_piecewise_fx,
+        num_piecewise_capturable_graphs_seen=num_piecewise_capturable_fx,
+        num_backend_compilations=num_piecewise_capturable_fx,
+        # num_cudagraph_sizes * num_partitions
+        num_cudagraph_captured=2 * (1 + llama_config.num_layers),
    ):
-        outputs.append(
-            run_model(llama_config, backend=backend, use_compile=True, split_attn=True)
-        )
-    run_model(tractable_config, backend=backend, use_compile=True, split_attn=True)
+        outputs.append(run_model(llama_config, compile_config_split))
+    run_model(tractable_config, compile_config_split)

     for i in range(1, len(outputs)):
         assert torch.allclose(outputs[0], outputs[i])
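The toy-llama expectations follow the same arithmetic: splitting one graph around num_layers attention ops yields 2 * num_layers + 1 pieces, 1 + num_layers of which are capturable, and each capturable piece is captured once per cudagraph size. A hypothetical check (not part of the commit) of the numbers used with num_layers=2 and cudagraph_capture_sizes=[1, 2]:

num_layers = 2  # llama_config.num_layers in the test
num_cudagraph_sizes = 2  # len(cudagraph_capture_sizes)

num_piecewise_fx = 2 * num_layers + 1  # 5 pieces after Dynamo-level splitting
num_piecewise_capturable_fx = 1 + num_layers  # 3 capturable pieces
num_cudagraph_captured = num_cudagraph_sizes * num_piecewise_capturable_fx  # 6

assert (num_piecewise_fx, num_piecewise_capturable_fx, num_cudagraph_captured) == (5, 3, 6)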

tests/compile/silly_attention.py

Lines changed: 0 additions & 1 deletion
@@ -62,5 +62,4 @@ def silly_attention_fake(
     mutates_args=["out"],
     fake_impl=silly_attention_fake,
     target_lib=silly_lib,
-    tags=(torch._C.Tag.cudagraph_unsafe,),
 )

tests/compile/test_decorator.py

Lines changed: 3 additions & 0 deletions
@@ -73,6 +73,7 @@ def test_ignore_torch_compile_decorator():
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=False,  # TODO test both?
         )
     )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@@ -188,6 +189,7 @@ def test_conditional_compile_enable_if():
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=False,  # TODO test both
         ),
     )
     cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE
@@ -220,6 +222,7 @@ def test_conditional_compile_enable_if():
             use_cudagraph=True,
             splitting_ops=["silly::attention"],
             cudagraph_capture_sizes=[1, 2],
+            use_inductor_graph_partition=False,  # TODO test both?
         ),
     )

vllm/attention/layer.py

Lines changed: 0 additions & 6 deletions
@@ -38,10 +38,6 @@

 logger = init_logger(__name__)
 USE_XFORMERS_OPS = None
-try:
-    tag_cudagraph_unsafe = (torch._C.Tag.cudagraph_unsafe,)
-except AttributeError:
-    tag_cudagraph_unsafe = ()  # type: ignore[assignment]


 def check_xformers_availability():
@@ -879,7 +875,6 @@ def unified_attention_fake(
     op_name="unified_attention",
     op_func=unified_attention,
     fake_impl=unified_attention_fake,
-    tags=tag_cudagraph_unsafe,
 )


@@ -931,7 +926,6 @@ def unified_attention_with_output_fake(
     op_func=unified_attention_with_output,
     mutates_args=["output", "output_block_scale"],
     fake_impl=unified_attention_with_output_fake,
-    tags=tag_cudagraph_unsafe,
 )
0 commit comments

Comments
 (0)