
Commit ba25cff

undo debug stuff
Signed-off-by: czhu-cohere <conway.zhu@cohere.com>
Parent: 5b793e3

5 files changed: 8 additions & 72 deletions

vllm/model_executor/layers/fused_moe/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -93,6 +93,7 @@ def get_config() -> dict[str, Any] | None:
     "cutlass_moe_fp4",
     "CutlassExpertsFp8",
     "CutlassBatchedExpertsFp8",
+    "CutlassExpertsW4A8Fp8",
     "TritonExperts",
     "BatchedTritonExperts",
     "DeepGemmExperts",

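The hunk above registers the new CutlassExpertsW4A8Fp8 class in the package export list (the surrounding entries suggest this is the module's __all__, though the hunk shows only list items). Assuming that, a minimal usage sketch:

    # Hypothetical usage, assuming the list above is the package's __all__;
    # CutlassExpertsW4A8Fp8 is the class exported on this commit's branch.
    from vllm.model_executor.layers.fused_moe import CutlassExpertsW4A8Fp8
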
vllm/model_executor/layers/fused_moe/cutlass_moe.py

Lines changed: 1 addition & 47 deletions
@@ -27,19 +27,6 @@
 logger = init_logger(__name__)
 
 
-# print utilities
-def print_args_info(*args, **kwargs):
-    print("=== positional args ===")
-    for i, a in enumerate(args):
-        print(f"\narg[{i}]:")
-        # print tensor info only
-        if isinstance(a, torch.Tensor):
-            print(f" shape : {tuple(a.shape)}")
-            print(f" stride: {tuple(a.stride())}")
-            print(f" dtype : {a.dtype}")
-            print(f" device: {a.device}")
-
-
 def run_cutlass_moe_fp8(
     output: torch.Tensor,
     hidden_states: torch.Tensor,

@@ -207,24 +194,6 @@ def run_cutlass_moe_fp8(
         # this rank handles only partial tokens, or when it is batched.
         mm1_out.fill_(0)
 
-    # print(f'Printing information for first moe call')
-    # print_args_info(
-    #     mm1_out,
-    #     a1q,
-    #     w1,
-    #     a1q_scale,
-    #     w1_scale,
-    #     expert_offsets,
-    #     problem_sizes1,
-    #     ab_strides1,
-    #     ab_strides1,
-    #     c_strides1,
-    #     per_act_token,
-    #     per_out_ch,
-    # )
-    # print problem shapes and stuff
-    # print(f'{problem_sizes1=}')
-    # print(f'{expert_offsets=}')
     ops.cutlass_moe_mm(
         mm1_out,
         a1q,

@@ -248,22 +217,7 @@ def run_cutlass_moe_fp8(
 
     if expert_map is not None:
         mm2_out.fill_(0)
-    # print('=========================')
-    # print(f'Printing information for second moe call...')
-    # print_args_info(
-    #     mm2_out,
-    #     a2q,
-    #     w2,
-    #     a2q_scale,
-    #     w2_scale,
-    #     expert_offsets,
-    #     problem_sizes2,
-    #     ab_strides2,
-    #     ab_strides2,
-    #     c_strides2,
-    #     per_act_token,
-    #     per_out_ch,
-    # )
+
     ops.cutlass_moe_mm(
         mm2_out,
         a2q,
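
This file (and fused_marlin_moe.py below) drops the ad-hoc print_args_info debug helper. If similar tensor introspection is wanted later, a minimal sketch of a logger-gated variant, assuming the module-level logger = init_logger(__name__) shown in the first hunk:

    # Sketch only: logs the same tensor metadata as the removed helper, but
    # routed through the module logger so output is controlled by log level
    # rather than by editing hot paths.
    import torch

    def log_args_info(logger, *args) -> None:
        for i, a in enumerate(args):
            if isinstance(a, torch.Tensor):
                logger.debug(
                    "arg[%d]: shape=%s stride=%s dtype=%s device=%s",
                    i, tuple(a.shape), tuple(a.stride()), a.dtype, a.device,
                )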

vllm/model_executor/layers/fused_moe/fused_marlin_moe.py

Lines changed: 1 addition & 20 deletions
@@ -29,19 +29,6 @@
 from vllm.scalar_type import ScalarType, scalar_types
 
 
-# print utilities
-def print_args_info(*args, **kwargs):
-    print("=== positional args ===")
-    for i, a in enumerate(args):
-        print(f"\narg[{i}]:")
-        # print tensor info only
-        if isinstance(a, torch.Tensor):
-            print(f" shape : {tuple(a.shape)}")
-            print(f" stride: {tuple(a.stride())}")
-            print(f" dtype : {a.dtype}")
-            print(f" device: {a.device}")
-
-
 def default_activation_func(
     activation: str, output: torch.Tensor, input: torch.Tensor
 ) -> None:

@@ -124,13 +111,7 @@ def _fused_marlin_moe(
         hidden_states.dtype == torch.half
         or torch.cuda.get_device_capability(hidden_states.device)[0] >= 9
     )
-    # print('printing stuff before moe 1')
-    # print_args_info(
-    #     hidden_states,
-    #     w1,
-    #     w1_scale,
-    #     intermediate_cache1
-    # )
+
     intermediate_cache1 = ops.moe_wna16_marlin_gemm(
         hidden_states,
         intermediate_cache1,

vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe.py

Lines changed: 5 additions & 4 deletions
@@ -2396,8 +2396,6 @@ def create_weights(
         layer.register_parameter("w2_weight_packed", w2_weight_packed)
         set_weight_attrs(w2_weight_packed, extra_weight_attrs)
 
-        # TODO(czhu): fix TP > 1 case, probably this and other stuff
-        # needs change
         # weight_scale refers to the group-wise scales
         w13_weight_scale = torch.nn.Parameter(
             torch.ones(

@@ -2528,8 +2526,11 @@ def process_weights_after_loading(self, layer):
         )
         replace_parameter(layer, "w2_weight_packed", w2_weight_shuffled)
 
-    def maybe_make_prepare_finalize(self) -> mk.FusedMoEPrepareAndFinalize | None:
-        return super().maybe_make_prepare_finalize()
+    def maybe_make_prepare_finalize(
+        self,
+        routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor] | None = None,
+    ) -> mk.FusedMoEPrepareAndFinalize | None:
+        return super().maybe_make_prepare_finalize(routing_tables)
 
     def get_fused_moe_quant_config(
         self, layer: torch.nn.Module
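
The second hunk keeps the override's signature in sync with a base-class change that threads a new optional routing_tables argument through maybe_make_prepare_finalize. A minimal sketch of the pattern with stand-in class names (the real base class and return type live in vLLM's modular-kernel code, abbreviated mk above):

    import torch

    class BaseMoEMethod:
        def maybe_make_prepare_finalize(
            self,
            routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor]
            | None = None,
        ) -> object | None:
            return None  # base default: no prepare/finalize object

    class W4A8MoEMethod(BaseMoEMethod):
        def maybe_make_prepare_finalize(
            self,
            routing_tables: tuple[torch.Tensor, torch.Tensor, torch.Tensor]
            | None = None,
        ) -> object | None:
            # Mirror the base signature and forward the new argument so
            # callers that pass routing_tables keep working.
            return super().maybe_make_prepare_finalize(routing_tables)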

vllm/model_executor/models/registry.py

Lines changed: 0 additions & 1 deletion
@@ -74,7 +74,6 @@
     "ChatGLMForConditionalGeneration": ("chatglm", "ChatGLMForCausalLM"),
     "CohereForCausalLM": ("commandr", "CohereForCausalLM"),
     "Cohere2ForCausalLM": ("commandr", "CohereForCausalLM"),
-    "Cohere2MoeForCausalLM": ("commandr", "Cohere2MoeForCausalLM"),
     "CwmForCausalLM": ("llama", "LlamaForCausalLM"),
     "DbrxForCausalLM": ("dbrx", "DbrxForCausalLM"),
     "DeciLMForCausalLM": ("nemotron_nas", "DeciLMForCausalLM"),

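The registry above maps an architecture string to a (module, class-name) pair that is resolved lazily at load time; this commit drops the Cohere2MoeForCausalLM entry. A minimal sketch of that lookup pattern, with a hypothetical resolve helper (vLLM's actual resolution in registry.py is more involved):

    import importlib

    # Hypothetical stand-in for the registry dict shown in the diff.
    _MODELS = {
        "CohereForCausalLM": ("commandr", "CohereForCausalLM"),
        "Cohere2ForCausalLM": ("commandr", "CohereForCausalLM"),
    }

    def resolve_model_cls(architecture: str) -> type:
        # Import the named module under the models package and fetch the
        # class named in the registry entry.
        module_name, cls_name = _MODELS[architecture]
        module = importlib.import_module(
            f"vllm.model_executor.models.{module_name}"
        )
        return getattr(module, cls_name)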