15 changes: 3 additions & 12 deletions python/paddle/distributed/auto_parallel/fully_shard_fusion.py
@@ -26,18 +26,9 @@
     alignment,
     get_current_device_type,
 )
-
-# Global registry for fsdp_context
-_g_fsdp_context = None
-
-
-def register_fsdp_context(context):
-    global _g_fsdp_context
-    _g_fsdp_context = context
-
-
-def get_fsdp_context():
-    return _g_fsdp_context
+from paddle.distributed.fsdp._fsdp_context import (
+    register_fsdp_context,
+)
 
 
 class BufferState(Enum):
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
@@ -138,7 +138,12 @@ def _dygraph_clip(self, params_grads):
                 and getattr(p, 'is_firstly_shared', True)
             )
 
-            if not_shared_enable:
+            from paddle.distributed.fsdp._fsdp_context import (
+                get_fsdp_context,
+            )
+
+            fsdp_context = get_fsdp_context()
+            if not_shared_enable and fsdp_context is None:
                 if p.is_distributed:
                     if g.dtype == paddle.float16:
                         sum_square_dist_fp16.append(sum_square)
@@ -280,7 +285,12 @@ class HybridParallelOptimizer:
     def __init__(self, optimizer, hcg, strategy):
         # Note: Only sharding stage 1 is considered in HybridParallelOptimizer.
         # The sharding stage2 and stage3 optimizers are invoked in other api.
-        if hcg.get_sharding_parallel_world_size() > 1:
+        from paddle.distributed.fsdp._fsdp_context import (
+            get_fsdp_context,
+        )
+
+        fsdp_context = get_fsdp_context()
+        if hcg.get_sharding_parallel_world_size() > 1 and fsdp_context is None:
             split_param = strategy.hybrid_configs[
                 'sharding_configs'
             ].split_param
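Both hunks above follow the same pattern: the shared context is imported lazily inside the function body, and the pre-existing sharding branch is taken only when no FSDP context has been registered. A minimal sketch of that guard, assuming only the two helpers defined in the new `_fsdp_context.py` module below (the helper name `_use_legacy_sharding` is hypothetical, and the local import mirrors the diff, presumably to avoid an import cycle):

```python
def _use_legacy_sharding(sharding_world_size: int) -> bool:
    # Imported inside the function, as in the diff, so that this module does
    # not pull in paddle.distributed.fsdp at module-import time.
    from paddle.distributed.fsdp._fsdp_context import get_fsdp_context

    # If fully_shard_fusion has registered a context, FSDP fusion owns
    # sharding and the legacy hybrid-sharding branch is skipped.
    return sharding_world_size > 1 and get_fsdp_context() is None
```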
31 changes: 31 additions & 0 deletions python/paddle/distributed/fsdp/_fsdp_context.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Shared FSDP context module.
+This module provides a unified global registry for fsdp_context,
+used by both fsdp.fully_shard_fusion and auto_parallel.fully_shard_fusion.
+"""
+
+# Global registry for fsdp_context
+_g_fsdp_context = None
+
+
+def register_fsdp_context(context):
+    global _g_fsdp_context
+    _g_fsdp_context = context
+
+
+def get_fsdp_context():
+    return _g_fsdp_context
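For illustration, a minimal usage sketch of the two helpers added above, assuming any Python object can act as the context (the `_FusionContext` class is hypothetical; in this PR the real context object is created and registered by the `fully_shard_fusion` code paths):

```python
from paddle.distributed.fsdp._fsdp_context import (
    get_fsdp_context,
    register_fsdp_context,
)


class _FusionContext:
    """Hypothetical stand-in for the context built by fully_shard_fusion."""


# Before registration, consumers such as HybridParallelOptimizer see None
# and keep their existing behaviour.
assert get_fsdp_context() is None

# After fully_shard_fusion registers its context, the same consumers can
# detect that FSDP fusion is active and skip their own sharding setup.
register_fsdp_context(_FusionContext())
assert get_fsdp_context() is not None
```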
8 changes: 4 additions & 4 deletions python/paddle/distributed/fsdp/fully_shard.py
@@ -22,9 +22,8 @@
 import paddle.distributed as dist
 import paddle
 from paddle.distributed.auto_parallel.fully_shard import FullyShardAuto
-from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_fully_shard import (
-    FullyShard,
-)
+
+from .fully_shard_fusion import FullyShardFusion
 
 
 def in_auto_parallel_mode() -> bool:
@@ -56,7 +55,8 @@ def _fully_shard_manual_parallel(
     ignored_params,
     enable_tensor_fusion_and_overlap,
 ):
-    return FullyShard(module)
+    FullyShardFusion(module)
+    return module
 
 
 def _fully_shard_auto_parallel(