
Commit 2902c34

bnellnm and tlrmchlsmth authored
[Kernels] Remove BatchedTritonOrDeepGemmExperts and default fallback to Triton (#29929)
Signed-off-by: Bill Nell <bnell@redhat.com>
Signed-off-by: bnellnm <49004751+bnellnm@users.noreply.github.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
1 parent ac18865 commit 2902c34

5 files changed: +46 -217 lines


docs/design/moe_kernel_features.md

Lines changed: 1 addition & 2 deletions
@@ -90,7 +90,6 @@ To be used with a particular `FusedMoEPrepareAndFinalize` subclass, MoE kernels
 | cutlass_fp8 | standard,</br>batched | fp8 | A,T | silu, gelu | Y | Y | [`cutlass_moe_fp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.cutlass_moe_fp8],</br>[`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],</br>[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] |
 | flashinfer | standard | nvfp4,</br>fp8 | T | <sup>5</sup> | N | Y | [`flashinfer_cutlass_moe_fp4`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.flashinfer_cutlass_moe_fp4],</br>[`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] |
 | gpt oss triton | standard | N/A | N/A | <sup>5</sup> | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],</br>[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.gpt_oss_triton_kernels_moe.OAITritonExperts] |
-| deep gemm+triton<sup>2</sup> | standard,</br>batched | all<sup>1</sup> | G(128),A,T | silu, gelu | <sup>6</sup> | Y | [`TritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe.TritonOrDeepGemmExperts],</br>[`BatchedTritonOrDeepGemmExperts`][vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe.BatchedTritonOrDeepGemmExperts] |
 | marlin | standard,</br>batched | <sup>3</sup> / N/A | <sup>3</sup> / N/A | silu,</br>swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],</br>[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],</br>[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] |
 | trtllm | standard | mxfp4,</br>nvfp4 | G(16),G(32) | <sup>5</sup> | N | Y | [`TrtLlmGenExperts`][vllm.model_executor.layers.fused_moe.trtllm_moe.TrtLlmGenExperts] |
 | pallas | standard | N/A | N/A | silu | N | N | [`fused_moe`][vllm.model_executor.layers.fused_moe.moe_pallas.fused_moe] |
@@ -114,5 +113,5 @@ The following table shows "families" of modular kernels that are intended to wor
 | backend | `FusedMoEPrepareAndFinalize` subclasses | `FusedMoEPermuteExpertsUnpermute` subclasses |
 |---------|-----------------------------------------|----------------------------------------------|
 | deepep_high_throughput | `DeepEPHTPrepareAndFinalize` | `DeepGemmExperts`,</br>`TritonExperts`,</br>`TritonOrDeepGemmExperts`,</br>`CutlassExpertsFp8`, </br>`MarlinExperts` |
-| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`BatchedTritonOrDeepGemmExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
+| deepep_low_latency,</br>pplx | `DeepEPLLPrepareAndFinalize`,</br>`PplxPrepareAndFinalize` | `BatchedDeepGemmExperts`,</br>`BatchedTritonExperts`,</br>`CutlassBatchedExpertsFp8`,</br>`BatchedMarlinExperts` |
 | flashinfer | `FlashInferCutlassMoEPrepareAndFinalize` | `FlashInferExperts` |
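
For reference, the batched modular-kernel family documented in the table above now looks like this. The snippet is a minimal illustrative sketch in plain Python, not a vLLM data structure; only the class names come from the table.

# Illustrative only: restates the updated "deepep_low_latency / pplx" row.
BATCHED_MODULAR_KERNEL_FAMILY = {
    "prepare_finalize": [
        "DeepEPLLPrepareAndFinalize",
        "PplxPrepareAndFinalize",
    ],
    "experts": [
        "BatchedDeepGemmExperts",
        "BatchedTritonExperts",
        # BatchedTritonOrDeepGemmExperts is removed by this commit.
        "CutlassBatchedExpertsFp8",
        "BatchedMarlinExperts",
    ],
}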

tests/kernels/moe/modular_kernel_tools/mk_objects.py

Lines changed: 0 additions & 17 deletions
@@ -13,9 +13,6 @@
 from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
     BatchedDeepGemmExperts,
 )
-from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (
-    BatchedTritonOrDeepGemmExperts,
-)
 from vllm.model_executor.layers.fused_moe.config import (
     FusedMoEConfig,
     FusedMoEQuantConfig,
@@ -286,16 +283,6 @@ def expert_info(kind) -> ExpertInfo:
     needs_matching_quant=False,
     needs_deep_gemm=True,
 )
-register_experts(
-    BatchedTritonOrDeepGemmExperts,
-    batched_format,
-    common_float_and_int_types,
-    blocked_quantization_support=True,
-    supports_chunking=False,
-    supports_expert_map=False,
-    needs_matching_quant=True,
-    needs_deep_gemm=True,
-)
 register_experts(
     TritonOrDeepGemmExperts,
     standard_format,
@@ -457,10 +444,6 @@ def make_fused_experts(
         kwargs = batch_kwargs | quant_kwargs
         print(f"Making BatchedTritonExperts {kwargs} ...")
         experts = BatchedTritonExperts(**kwargs)
-    elif fused_experts_type == BatchedTritonOrDeepGemmExperts:
-        kwargs = batch_kwargs | quant_kwargs | deepgemm_kwargs
-        print(f"Making BatchedTritonOrDeepGemmExperts {kwargs} ...")
-        experts = BatchedTritonOrDeepGemmExperts(**kwargs)
     elif fused_experts_type == DeepGemmExperts:
         print(f"Making DeepGemmExperts {quant_config} ...")
         experts = DeepGemmExperts(quant_config)

vllm/model_executor/layers/fused_moe/__init__.py

Lines changed: 0 additions & 4 deletions
@@ -60,9 +60,6 @@ def get_config() -> dict[str, Any] | None:
     from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
         BatchedDeepGemmExperts,
     )
-    from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
-        BatchedTritonOrDeepGemmExperts,
-    )
     from vllm.model_executor.layers.fused_moe.cutlass_moe import (
         CutlassBatchedExpertsFp8,
         CutlassExpertsFp8,
@@ -98,7 +95,6 @@ def get_config() -> dict[str, Any] | None:
         "DeepGemmExperts",
         "BatchedDeepGemmExperts",
         "TritonOrDeepGemmExperts",
-        "BatchedTritonOrDeepGemmExperts",
     ]
 else:
     # Some model classes directly use the custom ops. Add placeholders
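
With the wrapper dropped from the package exports above, downstream code that previously imported `BatchedTritonOrDeepGemmExperts` has to pick one of the remaining batched classes explicitly. A hedged sketch of the replacement imports, using the module paths that appear in the `compressed_tensors_moe.py` diff below:

# BatchedTritonOrDeepGemmExperts no longer exists after this commit; import the
# concrete implementations instead (module paths taken from this commit's diff).
from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
    BatchedDeepGemmExperts,
)
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
    BatchedTritonExperts,
)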

vllm/model_executor/layers/fused_moe/batched_triton_or_deep_gemm_moe.py

Lines changed: 0 additions & 180 deletions
This file was deleted.

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 45 additions & 14 deletions
@@ -90,8 +90,10 @@
 from vllm.scalar_type import scalar_types
 from vllm.utils.deep_gemm import (
     get_col_major_tma_aligned_tensor,
+    get_mk_alignment_for_contiguous_layout,
     is_deep_gemm_e8m0_used,
 )
+from vllm.utils.import_utils import has_deep_gemm
 
 logger = init_logger(__name__)
 
@@ -1088,39 +1090,68 @@ def select_gemm_impl(
 
            return experts
 
-        # triton path
-        from vllm.model_executor.layers.fused_moe.batched_triton_or_deep_gemm_moe import (  # noqa: E501
-            BatchedTritonOrDeepGemmExperts,
+        from vllm.model_executor.layers.fused_moe.batched_deep_gemm_moe import (
+            BatchedDeepGemmExperts,
+        )
+        from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
+            BatchedTritonExperts,
         )
         from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
             TritonOrDeepGemmExperts,
         )
 
         assert not self.rocm_aiter_moe_enabled and not self.use_marlin
 
+        use_deep_gemm = envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
+
         if (
             prepare_finalize.activation_format
             == FusedMoEActivationFormat.BatchedExperts
         ):
             max_num_tokens_per_rank = prepare_finalize.max_num_tokens_per_rank()
             assert max_num_tokens_per_rank is not None
 
-            logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__)
-            return BatchedTritonOrDeepGemmExperts(
-                max_num_tokens=max_num_tokens_per_rank,
-                num_dispatchers=prepare_finalize.num_dispatchers(),
-                quant_config=self.moe_quant_config,
-                allow_deep_gemm=(
-                    envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
-                ),
+            if use_deep_gemm and not has_deep_gemm():
+                raise RuntimeError(
+                    "DeepGEMM requested for MoE layer but not installed."
+                )
+
+            compatible_with_deep_gemm = (
+                self.moe_quant_config.use_fp8_w8a8
+                and self.moe_quant_config.block_shape
+                == get_mk_alignment_for_contiguous_layout()
             )
+
+            # If this MoE layer is compatible with DeepGEMM, the proper env
+            # vars are set and DeepGEMM is not installed, throw an error.
+            if use_deep_gemm and compatible_with_deep_gemm and not has_deep_gemm():
+                raise RuntimeError(
+                    f"MoE layer incompatible with DeepGEMM, expected "
+                    f"fp8==True, got {self.moe_quant_config.use_fp8_w8a8}"
+                    f"or block_shape {self.moe_quant_config.block_shape}"
+                    f"=={get_mk_alignment_for_contiguous_layout()}."
+                )
+
+            if use_deep_gemm and compatible_with_deep_gemm and has_deep_gemm():
+                logger.debug("BatchedDeepGemmExperts(%s)", self.__class__.__name__)
+                return BatchedDeepGemmExperts(
+                    max_num_tokens=max_num_tokens_per_rank,
+                    num_dispatchers=prepare_finalize.num_dispatchers(),
+                    quant_config=self.moe_quant_config,
+                )
+            else:
+                logger.debug("BatchedTritonExperts(%s)", self.__class__.__name__)
+                return BatchedTritonExperts(
+                    max_num_tokens=max_num_tokens_per_rank,
+                    num_dispatchers=prepare_finalize.num_dispatchers(),
+                    quant_config=self.moe_quant_config,
+                )
+
         else:
             logger.debug("TritonOrDeepGemmExperts(%s)", self.__class__.__name__)
             return TritonOrDeepGemmExperts(
                 self.moe_quant_config,
-                allow_deep_gemm=(
-                    envs.VLLM_USE_DEEP_GEMM and envs.VLLM_MOE_USE_DEEP_GEMM
-                ),
+                allow_deep_gemm=use_deep_gemm,
             )
 
     def get_fused_moe_quant_config(
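
Condensed, the new batched branch of `select_gemm_impl` behaves as sketched below. The helper name and the flat boolean arguments are invented for illustration; in vLLM the decision is made inline from `envs.VLLM_USE_DEEP_GEMM`, `envs.VLLM_MOE_USE_DEEP_GEMM`, `has_deep_gemm()`, and the layer's quant config, as shown in the diff above.

# Hypothetical helper summarizing the selection logic added in this commit.
def pick_batched_experts(
    use_deep_gemm: bool,        # both DeepGEMM env vars enabled
    deep_gemm_installed: bool,  # result of has_deep_gemm()
    is_fp8_w8a8: bool,          # moe_quant_config.use_fp8_w8a8
    block_shape_aligned: bool,  # block_shape == get_mk_alignment_for_contiguous_layout()
) -> str:
    if use_deep_gemm and not deep_gemm_installed:
        raise RuntimeError("DeepGEMM requested for MoE layer but not installed.")
    compatible_with_deep_gemm = is_fp8_w8a8 and block_shape_aligned
    if use_deep_gemm and compatible_with_deep_gemm and deep_gemm_installed:
        return "BatchedDeepGemmExperts"
    # Default fallback introduced by this commit: plain batched Triton experts.
    return "BatchedTritonExperts"

In other words, when DeepGEMM is disabled, not installed, or the layer's quantization is not block-aligned fp8, the layer now falls back directly to `BatchedTritonExperts` instead of deferring the choice to the removed wrapper class.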
