Skip to content

Commit 095a11d

Browse files
authored
fix MTP bugs in TP and overlap (#7172)
* fix MTP bugs in TP and overlap
* fix
1 parent 3b8dac3 commit 095a11d

File tree

3 files changed

+7
-6
lines changed

3 files changed

+7
-6
lines changed

custom_ops/gpu_ops/speculate_decoding/speculate_save_output.cc

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,8 +36,9 @@ void SpeculateSaveWithOutputMsg(const paddle::Tensor& accept_tokens,
3636
int msg_queue_id,
3737
int save_each_rank,
3838
bool skip_prefill) {
39-
// printf("enter save output");
40-
if (!save_each_rank && rank_id > 0) {
39+
// NOTE(yaohuicong): Skip non-zero TP ranks — they share identical sampling
40+
// outputs, so only rank 0 needs to send results to the message queue.
41+
if (rank_id > 0) {
4142
return;
4243
}
4344

custom_ops/gpu_ops/speculate_decoding/speculate_save_output_with_topk.cc

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,9 @@ void SpeculateSaveOutMmsgTopK(const paddle::Tensor& sampled_token_ids,
5353
int message_flag, // Target: 3, Draft: 4
5454
int64_t rank_id,
5555
bool save_each_rank) {
56-
if (!save_each_rank && rank_id > 0) {
56+
// NOTE(yaohuicong): Skip non-zero TP ranks — they share identical sampling
57+
// outputs, so only rank 0 needs to send results to the message queue.
58+
if (rank_id > 0) {
5759
return;
5860
}
5961

fastdeploy/worker/gpu_model_runner.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -345,9 +345,7 @@ def _predict_next_launch_token_num(self) -> int:
345345
is_block_step_cpu = self.share_inputs["is_block_step_cpu"].numpy()
346346
next_real_bsz = (seq_lens_this_time_cpu > 0).sum().item() + (is_block_step_cpu > 0).sum().item()
347347
token_num_one_step = (self.speculative_config.num_speculative_tokens + 1) if self.speculative_decoding else 1
348-
next_launch_token_num = (
349-
seq_lens_this_time_cpu.sum().item() + is_block_step_cpu.sum().item() * token_num_one_step
350-
)
348+
next_launch_token_num = next_real_bsz * token_num_one_step
351349
return next_launch_token_num, next_real_bsz
352350

353351
def only_prefill(self):

0 commit comments

Comments
 (0)