fix MTP bugs in TP and overlap (#7172)

huicongyao · web-flow · commit 095a11d93258 · 2026-04-03T14:19:11.000+08:00
* Fix MTP bugs under tensor parallelism (TP) and overlap

* fix
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc
@@ -36,8 +36,9 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
                                 int msg_queue_id,
                                 int save_each_rank,
                                 bool skip_prefill) {
-  // printf("enter save output");
-  if (!save_each_rank && rank_id > 0) {
+  // NOTE(yaohuicong): Skip non-zero TP ranks — they share identical sampling
+  // outputs, so only rank 0 needs to send results to the message queue.
+  if (rank_id > 0) {
     return;
   }
 
diff --git a/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc b/custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc
@@ -53,7 +53,9 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids,
                               int message_flag,  // Target: 3, Draft: 4
                               int64_t rank_id,
                               bool save_each_rank) {
-  if (!save_each_rank && rank_id > 0) {
+  // NOTE(yaohuicong): Skip non-zero TP ranks — they share identical sampling
+  // outputs, so only rank 0 needs to send results to the message queue.
+  if (rank_id > 0) {
     return;
   }
 
diff --git a/fastdeploy/worker/gpu_model_runner.py b/fastdeploy/worker/gpu_model_runner.py
@@ -345,9 +345,7 @@ def _predict_next_launch_token_num(self) -> int:
         is_block_step_cpu = self.share_inputs["is_block_step_cpu"].numpy()
         next_real_bsz = (seq_lens_this_time_cpu > 0).sum().item() + (is_block_step_cpu > 0).sum().item()
         token_num_one_step = (self.speculative_config.num_speculative_tokens + 1) if self.speculative_decoding else 1
-        next_launch_token_num = (
-            seq_lens_this_time_cpu.sum().item() + is_block_step_cpu.sum().item() * token_num_one_step
-        )
+        next_launch_token_num = next_real_bsz * token_num_one_step
         return next_launch_token_num, next_real_bsz
 
     def only_prefill(self):