Skip to content

Commit 57b97d3

Browse files
SigureMo and Copilot authored
[Cherry-Pick][Optimization] Use a separate driver when using Triton with Paddle (#6897) (#7114)
--------- Co-authored-by: Copilot <[email protected]>
1 parent 6051d12 commit 57b97d3

File tree

11 files changed

+87
-2
lines changed

11 files changed

+87
-2
lines changed

fastdeploy/__init__.py

Lines changed: 9 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -17,6 +17,8 @@
1717
import os
1818
import uuid
1919

20+
import paddle
21+
2022
# suppress warning log from paddlepaddle
2123
os.environ["GLOG_minloglevel"] = "2"
2224
# suppress log from aistudio
@@ -44,7 +46,13 @@
4446

4547
from fastdeploy.engine.sampling_params import SamplingParams
4648
from fastdeploy.entrypoints.llm import LLM
47-
from fastdeploy.utils import current_package_version, envs
49+
from fastdeploy.utils import _is_package_installed, current_package_version, envs
50+
51+
# We can use enable_compat only when torch is not installed, otherwise it will
52+
# cause some unexpected issues in triton kernels. We use enable_compat_on_triton_kernel
53+
# for these cases.
54+
if not _is_package_installed("torch"):
55+
paddle.enable_compat(scope={"triton"})
4856

4957
if envs.FD_DEBUG != 1:
5058
import logging

fastdeploy/model_executor/guided_decoding/kernels/xgrammar_apply_token_bitmask.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -23,10 +23,15 @@
2323
try:
2424
import triton
2525
import triton.language as tl
26+
27+
from fastdeploy.model_executor.ops.triton_ops.triton_utils import (
28+
enable_compat_on_triton_kernel,
29+
)
2630
except ImportError as err:
2731
raise ImportError("Triton is not installed") from err
2832

2933

34+
@enable_compat_on_triton_kernel
3035
@triton.jit
3136
def apply_token_bitmask_inplace_kernel(
3237
logits_ptr,

fastdeploy/model_executor/layers/backends/dcu/triton_moe_kernels.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,12 @@
1717
import triton
1818
import triton.language as tl
1919

20+
from fastdeploy.model_executor.ops.triton_ops.triton_utils import (
21+
enable_compat_on_triton_kernel,
22+
)
2023

24+
25+
@enable_compat_on_triton_kernel
2126
@triton.jit
2227
def fused_moe_kernel_paddle(
2328
a_ptr,

fastdeploy/model_executor/layers/backends/metax/moe/triton_moe_kernels.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -15,7 +15,12 @@
1515
import triton
1616
import triton.language as tl
1717

18+
from fastdeploy.model_executor.ops.triton_ops.triton_utils import (
19+
enable_compat_on_triton_kernel,
20+
)
1821

22+
23+
@enable_compat_on_triton_kernel
1924
@triton.jit()
2025
def fused_moe_kernel_paddle(
2126
a_ptr,

fastdeploy/model_executor/layers/batch_invariant_ops/batch_invariant_ops.py

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -10,6 +10,10 @@
1010
import triton
1111
import triton.language as tl
1212

13+
from fastdeploy.model_executor.ops.triton_ops.triton_utils import (
14+
enable_compat_on_triton_kernel,
15+
)
16+
1317
__all__ = [
1418
"set_batch_invariant_mode",
1519
"is_batch_invariant_mode_enabled",
@@ -33,6 +37,7 @@ def _matmul_launch_metadata(grid: Callable[..., Any], kernel: Any, args: Dict[st
3337
return ret
3438

3539

40+
@enable_compat_on_triton_kernel
3641
@triton.jit
3742
def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS):
3843
group_id = tile_id // num_pid_in_group
@@ -43,6 +48,7 @@ def _compute_pid(tile_id, num_pid_in_group, num_pid_m, GROUP_SIZE_M, NUM_SMS):
4348
return pid_m, pid_n
4449

4550

51+
@enable_compat_on_triton_kernel
4652
@triton.jit(launch_metadata=_matmul_launch_metadata)
4753
def matmul_kernel_persistent(
4854
a_ptr,
@@ -220,6 +226,7 @@ def grid(META):
220226
return c
221227

222228

229+
@enable_compat_on_triton_kernel
223230
@triton.jit
224231
def _log_softmax_kernel(
225232
input_ptr,
@@ -324,6 +331,7 @@ def log_softmax(input: paddle.Tensor, axis: int = -1) -> paddle.Tensor:
324331
return output.reshape(original_shape)
325332

326333

334+
@enable_compat_on_triton_kernel
327335
@triton.jit
328336
def mean_kernel(
329337
input_ptr,

fastdeploy/model_executor/layers/moe/routing_indices_cache.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -36,8 +36,12 @@
3636
from paddleformers.utils.log import logger
3737

3838
from fastdeploy.config import FDConfig, RoutingReplayConfig
39+
from fastdeploy.model_executor.ops.triton_ops.triton_utils import (
40+
enable_compat_on_triton_kernel,
41+
)
3942

4043

44+
@enable_compat_on_triton_kernel
4145
@triton.jit
4246
def _save_routing_kernel(
4347
ROUTING_REPLAY_TABLE_PTR,

fastdeploy/model_executor/layers/moe/triton_moe_kernels.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,12 @@
1717
import triton
1818
import triton.language as tl
1919

20+
from fastdeploy.model_executor.ops.triton_ops.triton_utils import (
21+
enable_compat_on_triton_kernel,
22+
)
2023

24+
25+
@enable_compat_on_triton_kernel
2126
@triton.jit()
2227
def fused_moe_kernel_paddle(
2328
a_ptr,

fastdeploy/model_executor/layers/sample/logprobs.py

Lines changed: 4 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -18,9 +18,13 @@
1818
import triton
1919
import triton.language as tl
2020

21+
from fastdeploy.model_executor.ops.triton_ops.triton_utils import (
22+
enable_compat_on_triton_kernel,
23+
)
2124
from fastdeploy.platforms import current_platform
2225

2326

27+
@enable_compat_on_triton_kernel
2428
@triton.jit
2529
def count_greater_kernel(
2630
x_ptr, # [num_tokens, n_elements]

fastdeploy/model_executor/ops/triton_ops/repetition_early_stop_kernel.py

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,12 @@
1717
import triton
1818
import triton.language as tl
1919

20+
from fastdeploy.model_executor.ops.triton_ops.triton_utils import (
21+
enable_compat_on_triton_kernel,
22+
)
2023

24+
25+
@enable_compat_on_triton_kernel
2126
@triton.jit
2227
def repetition_early_stopper_kernel(
2328
trunc_ptr, # float32[B, W]

fastdeploy/model_executor/ops/triton_ops/triton_utils.py

Lines changed: 27 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -24,19 +24,45 @@
2424
from paddle.base.framework import OpProtoHolder
2525

2626
from fastdeploy import envs
27+
from fastdeploy.utils import _is_package_installed
2728

2829
compile_file = triton.__path__[0] + "/tools/compile.py"
2930
link_file = triton.__path__[0] + "/tools/link.py"
3031
python_path = sys.executable
3132

33+
if _is_package_installed("torch"):
34+
with paddle.use_compat_guard(enable=True, silent=True):
35+
from triton.runtime.driver import _create_driver
36+
37+
paddle_driver = _create_driver()
38+
39+
40+
def swap_driver_guard(fn):
41+
from triton.runtime.driver import driver
42+
43+
# A lightweight wrapper to enable compatibility for triton kernel
44+
def wrapped_fn(*args, **kwargs):
45+
driver.set_active(paddle_driver)
46+
try:
47+
return fn(*args, **kwargs)
48+
finally:
49+
driver.reset_active()
50+
51+
return wrapped_fn
52+
3253

3354
def enable_compat_on_triton_kernel(triton_kernel):
55+
# When torch is not installed, this decorator does not do anything, just return the original triton kernel.
56+
# Because the `paddle.enable_compat(scope={"triton"})` already enabled in `__init__.py`, it will take zero runtime overhead.
57+
if not _is_package_installed("torch"):
58+
return triton_kernel
59+
3460
class WrappedTritonKernel:
3561
def __init__(self, kernel):
3662
self.kernel = kernel
3763

3864
def __getitem__(self, index):
39-
return paddle.use_compat_guard(enable=True, silent=True)(self.kernel[index])
65+
return swap_driver_guard(self.kernel[index])
4066

4167
return WrappedTritonKernel(triton_kernel)
4268

0 commit comments

Comments (0)