15 changes: 3 additions & 12 deletions python/paddle/distributed/auto_parallel/fully_shard_fusion.py
@@ -26,18 +26,9 @@
     alignment,
     get_current_device_type,
 )
-
-# Global registry for fsdp_context
-_g_fsdp_context = None
-
-
-def register_fsdp_context(context):
-    global _g_fsdp_context
-    _g_fsdp_context = context
-
-
-def get_fsdp_context():
-    return _g_fsdp_context
+from paddle.distributed.fsdp._fsdp_context import (
+    register_fsdp_context,
+)
 
 
 class BufferState(Enum):
python/paddle/distributed/fleet/meta_optimizers/dygraph_optimizer/hybrid_parallel_optimizer.py
@@ -138,7 +138,12 @@ def _dygraph_clip(self, params_grads):
                 and getattr(p, 'is_firstly_shared', True)
             )
 
-            if not_shared_enable:
+            from paddle.distributed.fsdp._fsdp_context import (
+                get_fsdp_context,
+            )
+
+            fsdp_context = get_fsdp_context()
+            if not_shared_enable and fsdp_context is None:
                 if p.is_distributed:
                     if g.dtype == paddle.float16:
                         sum_square_dist_fp16.append(sum_square)
@@ -280,7 +285,12 @@ class HybridParallelOptimizer:
     def __init__(self, optimizer, hcg, strategy):
         # Note: Only sharding stage 1 is considered in HybridParallelOptimizer.
         # The sharding stage2 and stage3 optimizers are invoked in other api.
-        if hcg.get_sharding_parallel_world_size() > 1:
+        from paddle.distributed.fsdp._fsdp_context import (
+            get_fsdp_context,
+        )
+
+        fsdp_context = get_fsdp_context()
+        if hcg.get_sharding_parallel_world_size() > 1 and fsdp_context is None:
             split_param = strategy.hybrid_configs[
                 'sharding_configs'
             ].split_param
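Both hunks above follow the same pattern: the shared context is imported lazily inside the function body, and the pre-existing sharding branch is taken only when no FSDP context has been registered. A minimal sketch of that guard, assuming only the two helpers defined in the new `_fsdp_context.py` module below (the helper name `_use_legacy_sharding` is hypothetical, and the local import mirrors the diff, presumably to avoid an import cycle):

```python
def _use_legacy_sharding(sharding_world_size: int) -> bool:
    # Imported inside the function, as in the diff, so that this module does
    # not pull in paddle.distributed.fsdp at module-import time.
    from paddle.distributed.fsdp._fsdp_context import get_fsdp_context

    # If fully_shard_fusion has registered a context, FSDP fusion owns
    # sharding and the legacy hybrid-sharding branch is skipped.
    return sharding_world_size > 1 and get_fsdp_context() is None
```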
31 changes: 31 additions & 0 deletions python/paddle/distributed/fsdp/_fsdp_context.py
@@ -0,0 +1,31 @@
+# Copyright (c) 2026 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""
+Shared FSDP context module.
+This module provides a unified global registry for fsdp_context,
+used by both fsdp.fully_shard_fusion and auto_parallel.fully_shard_fusion.
+"""
+
+# Global registry for fsdp_context
+_g_fsdp_context = None
+
+
+def register_fsdp_context(context):
+    global _g_fsdp_context
+    _g_fsdp_context = context
+
+
+def get_fsdp_context():
+    return _g_fsdp_context
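For illustration, a minimal usage sketch of the two helpers added above, assuming any Python object can act as the context (the `_FusionContext` class is hypothetical; in this PR the real context object is created and registered by the `fully_shard_fusion` code paths):

```python
from paddle.distributed.fsdp._fsdp_context import (
    get_fsdp_context,
    register_fsdp_context,
)


class _FusionContext:
    """Hypothetical stand-in for the context built by fully_shard_fusion."""


# Before registration, consumers such as HybridParallelOptimizer see None
# and keep their existing behaviour.
assert get_fsdp_context() is None

# After fully_shard_fusion registers its context, the same consumers can
# detect that FSDP fusion is active and skip their own sharding setup.
register_fsdp_context(_FusionContext())
assert get_fsdp_context() is not None
```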
8 changes: 4 additions & 4 deletions python/paddle/distributed/fsdp/fully_shard.py
@@ -22,9 +22,8 @@
 import paddle.distributed as dist
 import paddle
 from paddle.distributed.auto_parallel.fully_shard import FullyShardAuto
-from paddle.distributed.fleet.meta_parallel.sharding.group_sharded_fully_shard import (
-    FullyShard,
-)
+
+from .fully_shard_fusion import FullyShardFusion
 
 
 def in_auto_parallel_mode() -> bool:
@@ -56,7 +55,8 @@ def _fully_shard_manual_parallel(
     ignored_params,
     enable_tensor_fusion_and_overlap,
 ):
-    return FullyShard(module)
+    FullyShardFusion(module)
+    return module
 
 
 def _fully_shard_auto_parallel(