[torchtitan][replicate] experiment with the new replicate integration in torchtitan

anshul-si · anshul-si · commit 7e7b06fad9f0 · 2026-02-11T02:25:30.000-08:00
ghstack-source-id: 69101fb Pull Request resolved: #1714
diff --git a/torchtitan/distributed/utils.py b/torchtitan/distributed/utils.py
@@ -243,14 +243,14 @@ def context():
 def maybe_enable_amp(
     parallel_dims: ParallelDims, mixed_precision_param: str, device_type: str
 ) -> contextlib.AbstractContextManager[None] | torch.autocast:
-    if parallel_dims.fsdp_enabled:
+    if parallel_dims.fsdp_enabled or parallel_dims.dp_replicate_enabled:
-        # FSDP handles mixed precision internally
+        # fully_shard / replicate handle mixed precision internally
-        logger.info("Mixed precision training is handled by fully_shard")
+        logger.info("Mixed precision training is handled by fully_shard or replicate")
         return contextlib.nullcontext()
     else:
         if parallel_dims.tp_enabled or parallel_dims.pp_enabled:
             logger.warning(
-                "Mixed precision training with TP or PP is only supported when FSDP/HSDP/CP is enabled."
+                "Mixed precision training with TP or PP is only supported when FSDP/HSDP/CP/replicate is enabled."
             )
             logger.info("Mixed precision training is disabled")
             return contextlib.nullcontext()
diff --git a/torchtitan/experiments/rl/unified/actors/trainer.py b/torchtitan/experiments/rl/unified/actors/trainer.py
@@ -62,12 +62,13 @@ def __init__(
 
         # apply PT-D Parallelism
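-        # TODO: right now it only works for qwen3 model, need to formalize this to use parallize_fn from train_spec
+        # TODO: this currently only works for the qwen3 model; formalize it to use parallelize_fn from train_spec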
-        from torchtitan.models.llama3.infra.parallelize import apply_ddp
+        from torchtitan.models.llama3.infra.parallelize import apply_replicate
 
-        apply_ddp(
+        apply_replicate(
             self.model,
             self.parallel_dims.get_mesh("dp_replicate"),
-            enable_compile=False,
+            param_dtype=torch.bfloat16,
+            reduce_dtype=torch.float32,
         )
 
         self.model = self.model.to(device)
diff --git a/torchtitan/experiments/transformers_modeling_backend/infra/parallelize.py b/torchtitan/experiments/transformers_modeling_backend/infra/parallelize.py
@@ -18,14 +18,12 @@
 )
 from torchtitan.config import TORCH_DTYPE_MAP
 from torchtitan.distributed import NoParallel, ParallelDims
-
 from torchtitan.distributed.activation_checkpoint import apply_ac
-
 from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp
 from torchtitan.experiments.transformers_modeling_backend.job_config import JobConfig
 from torchtitan.models.llama3.infra.parallelize import (
     apply_compile,
-    apply_ddp,
+    apply_replicate,
     disable_fsdp_gradient_division,
 )
 from torchtitan.tools.logging import logger
@@ -113,13 +111,13 @@ def parallelize_hf_transformers(
         if job_config.training.enable_cpu_offload:
             logger.info("Applied CPU Offloading to the model")
     elif parallel_dims.dp_replicate_enabled:
-        dp_replicate_mesh = parallel_dims.get_mesh("dp_replicate")
-        if parallel_dims.world_size != dp_replicate_mesh.size():
-            raise RuntimeError("DDP has not supported > 1D parallelism")
-        apply_ddp(
+        dp_mesh_names = ["dp_replicate"]
+        dp_mesh = parallel_dims.get_mesh(dp_mesh_names)
+        apply_replicate(
             model,
-            dp_replicate_mesh,
-            enable_compile=model_compile_enabled,
+            dp_mesh,
+            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
+            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
         )
 
     return model
diff --git a/torchtitan/experiments/vlm/infra/parallelize.py b/torchtitan/experiments/vlm/infra/parallelize.py
@@ -9,18 +9,15 @@
 
 import torch
 import torch.nn as nn
-
 from torch.distributed.device_mesh import DeviceMesh
 from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy
-
 from torchtitan.config import JobConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed import ParallelDims
 from torchtitan.distributed.activation_checkpoint import apply_ac
-
 from torchtitan.models.llama3.infra.parallelize import (
     _op_sac_save_list,
     apply_compile,
-    apply_ddp,
+    apply_replicate,
     disable_fsdp_gradient_division,
 )
 from torchtitan.tools.logging import logger
@@ -99,13 +96,13 @@ def parallelize_vlm(
         if job_config.training.enable_cpu_offload:
             logger.info("Applied CPU Offloading to the model")
     elif parallel_dims.dp_replicate_enabled:
-        dp_mesh = parallel_dims.get_mesh("dp_replicate")
-        if dp_mesh is not None and dp_mesh.ndim > 1:
-            raise RuntimeError("DDP has not supported > 1D parallelism")
-        apply_ddp(
+        dp_mesh_names = ["dp_replicate"]
+        dp_mesh = parallel_dims.get_mesh(dp_mesh_names)
+        apply_replicate(
             model,
             dp_mesh,
-            enable_compile=job_config.compile.enable,
+            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
+            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
         )
 
     return model
diff --git a/torchtitan/models/deepseek_v3/infra/parallelize.py b/torchtitan/models/deepseek_v3/infra/parallelize.py
@@ -21,7 +21,7 @@
 from torchtitan.distributed.context_parallel import apply_cp_to_attention_module
 from torchtitan.distributed.dual_pipe_v import get_dual_pipe_v_flag
 from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp
-from torchtitan.models.llama3.infra.parallelize import apply_ddp
+from torchtitan.models.llama3.infra.parallelize import apply_replicate
 from torchtitan.models.llama4.infra.parallelize import (
     apply_compile,
     apply_fsdp,
@@ -195,13 +195,13 @@ def parallelize_deepseekv3(
         if job_config.training.enable_cpu_offload:
             logger.info("Applied CPU Offloading to the model")
     elif parallel_dims.dp_replicate_enabled:
-        dp_mesh = parallel_dims.get_mesh("dp_replicate")
-        if dp_mesh.ndim > 1:
-            raise RuntimeError("DDP has not supported > 1D parallelism")
-        apply_ddp(
+        dp_mesh_names = ["dp_replicate"]
+        dp_mesh = parallel_dims.get_mesh(dp_mesh_names)
+        apply_replicate(
             model,
             dp_mesh,
-            enable_compile=model_compile_enabled,
+            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
+            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
         )
 
     return model
diff --git a/torchtitan/models/gpt_oss/infra/parallelize.py b/torchtitan/models/gpt_oss/infra/parallelize.py
@@ -8,7 +8,6 @@
 import torch._inductor.config
 import torch.nn as nn
 from torch.distributed.device_mesh import DeviceMesh
-
 from torch.distributed.tensor import distribute_tensor, Partial, Replicate, Shard
 from torch.distributed.tensor.parallel import (
     ColwiseParallel,
@@ -31,7 +30,7 @@
     ExpertParallel,
     ReordererSequenceParallel,
 )
-from torchtitan.models.llama3.infra.parallelize import apply_ddp
+from torchtitan.models.llama3.infra.parallelize import apply_replicate
 from torchtitan.models.llama4.infra.parallelize import apply_fsdp
 from torchtitan.tools.logging import logger
 
@@ -160,13 +159,13 @@ def parallelize_gptoss(
         if job_config.training.enable_cpu_offload:
             logger.info("Applied CPU Offloading to the model")
     elif parallel_dims.dp_replicate_enabled:
-        dp_mesh = parallel_dims.get_mesh("dp_replicate")
-        if dp_mesh is not None and dp_mesh.ndim > 1:
-            raise RuntimeError("DDP has not supported > 1D parallelism")
-        apply_ddp(
+        dp_mesh_names = ["dp_replicate"]
+        dp_mesh = parallel_dims.get_mesh(dp_mesh_names)
+        apply_replicate(
             model,
             dp_mesh,
-            enable_compile=model_compile_enabled,
+            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
+            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
         )
 
     return model
diff --git a/torchtitan/models/llama3/infra/parallelize.py b/torchtitan/models/llama3/infra/parallelize.py
@@ -10,8 +10,7 @@
 import torch
 import torch.nn as nn
 from torch.distributed._composable.fsdp import FSDPModule
-from torch.distributed._composable.replicate import replicate
-
+from torch.distributed._composable.replicate_with_fsdp import replicate
 from torch.distributed.device_mesh import DeviceMesh
 from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard, MixedPrecisionPolicy
 from torch.distributed.tensor import Replicate, Shard
@@ -22,7 +21,6 @@
     RowwiseParallel,
     SequenceParallel,
 )
-
 from torchtitan.config import JobConfig, TORCH_DTYPE_MAP
 from torchtitan.config.job_config import Compile as CompileConfig
 from torchtitan.distributed import ParallelDims
@@ -146,13 +144,13 @@ def parallelize_llama(
         if job_config.training.enable_cpu_offload:
             logger.info("Applied CPU Offloading to the model")
     elif parallel_dims.dp_replicate_enabled:
-        dp_replicate_mesh = parallel_dims.get_mesh("dp_replicate")
-        if parallel_dims.world_size != dp_replicate_mesh.size():
-            raise RuntimeError("DDP has not supported > 1D parallelism")
-        apply_ddp(
+        names = ["dp_replicate"]
+        dp_mesh = parallel_dims.get_mesh(names)
+        apply_replicate(
             model,
-            dp_replicate_mesh,
-            enable_compile=model_compile_enabled,
+            dp_mesh,
+            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
+            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
         )
 
     return model
@@ -276,8 +274,10 @@ def disable_fsdp_gradient_division(model: nn.Module) -> None:
     Set gradient_divide_factor=1.0 to disable FSDP's automatic gradient division.
     We handle gradient scaling ourselves in the training loop with global token count.
 
+    Note: This also works for ReplicateModule since it inherits from FSDPModule.
+
     Args:
-        model: The model containing FSDP-wrapped modules
+        model: The model containing FSDP-wrapped or Replicate-wrapped modules
     """
     for module in model.modules():
         if isinstance(module, FSDPModule):
@@ -360,15 +360,37 @@ def apply_fsdp(
     disable_fsdp_gradient_division(model)
 
 
-def apply_ddp(
+def apply_replicate(
     model: nn.Module,
     dp_mesh: DeviceMesh,
-    enable_compile: bool,
+    param_dtype: torch.dtype,
+    reduce_dtype: torch.dtype,
 ):
-    if enable_compile:
-        torch._dynamo.config.optimize_ddp = "ddp_optimizer"
+    mp_policy = MixedPrecisionPolicy(param_dtype=param_dtype, reduce_dtype=reduce_dtype)
+    replicate_config = {"mesh": dp_mesh, "mp_policy": mp_policy}
 
-    # pyrefly: ignore [invalid-param-spec]
-    replicate(model, device_mesh=dp_mesh, bucket_cap_mb=100)
+    if model.tok_embeddings is not None:
+        # pyrefly: ignore [no-matching-overload]
+        replicate(
+            model.tok_embeddings,
+            **replicate_config,
+        )
+    # pyrefly: ignore [missing-attribute]
+    for layer_id, transformer_block in model.layers.items():
+        replicate(
+            transformer_block,
+            **replicate_config,
+        )
+
+    if model.norm is not None and model.output is not None:
+        # pyrefly: ignore [no-matching-overload]
+        replicate(
+            [model.norm, model.output],
+            **replicate_config,
+        )
+    replicate(model, **replicate_config)
+
+    # Disable Replicate's automatic gradient division (ReplicateModule inherits from FSDPModule)
+    disable_fsdp_gradient_division(model)
 
-    logger.info("Applied DDP to the model")
+    logger.info("Applied replicate to the model")
diff --git a/torchtitan/models/llama4/infra/parallelize.py b/torchtitan/models/llama4/infra/parallelize.py
@@ -41,7 +41,7 @@
 )
 from torchtitan.distributed.tensor_parallel import maybe_enable_async_tp
 from torchtitan.models.llama3.infra.parallelize import (
-    apply_ddp,
+    apply_replicate,
     disable_fsdp_gradient_division,
 )
 from torchtitan.models.moe import moe as moe_module
@@ -214,13 +214,13 @@ def parallelize_llama(
         if job_config.training.enable_cpu_offload:
             logger.info("Applied CPU Offloading to the model")
     elif parallel_dims.dp_replicate_enabled:
-        dp_mesh = parallel_dims.get_mesh("dp_replicate")
-        if parallel_dims.world_size != dp_mesh.size():
-            raise RuntimeError("DDP has not supported > 1D parallelism")
-        apply_ddp(
+        dp_mesh_names = ["dp_replicate"]
+        dp_mesh = parallel_dims.get_mesh(dp_mesh_names)
+        apply_replicate(
             model,
             dp_mesh,
-            enable_compile=model_compile_enabled,
+            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
+            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
         )
 
     return model
diff --git a/torchtitan/models/qwen3/infra/parallelize.py b/torchtitan/models/qwen3/infra/parallelize.py
@@ -10,7 +10,6 @@
 import torch
 import torch._inductor.config
 import torch.nn as nn
-
 from torch.distributed.device_mesh import DeviceMesh
 from torch.distributed.tensor import Replicate, Shard
 from torch.distributed.tensor.parallel import (
@@ -20,13 +19,12 @@
     RowwiseParallel,
     SequenceParallel,
 )
-
 from torchtitan.config import JobConfig, TORCH_DTYPE_MAP
 from torchtitan.distributed import ParallelDims
 from torchtitan.distributed.activation_checkpoint import apply_ac
 from torchtitan.distributed.context_parallel import apply_cp_to_attention_module
 from torchtitan.distributed.dual_pipe_v import get_dual_pipe_v_flag
-from torchtitan.models.llama3.infra.parallelize import apply_ddp
+from torchtitan.models.llama3.infra.parallelize import apply_replicate
 from torchtitan.models.llama4.infra.parallelize import (
     apply_compile,
     apply_fsdp,
@@ -175,13 +173,13 @@ def parallelize_qwen3(
         if job_config.training.enable_cpu_offload:
             logger.info("Applied CPU Offloading to the model")
     elif parallel_dims.dp_replicate_enabled:
-        dp_mesh = parallel_dims.get_mesh("dp_replicate")
-        if dp_mesh.ndim > 1:
-            raise RuntimeError("DDP has not supported > 1D parallelism")
-        apply_ddp(
+        dp_mesh_names = ["dp_replicate"]
+        dp_mesh = parallel_dims.get_mesh(dp_mesh_names)
+        apply_replicate(
             model,
             dp_mesh,
-            enable_compile=model_compile_enabled,
+            param_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_param],
+            reduce_dtype=TORCH_DTYPE_MAP[job_config.training.mixed_precision_reduce],
         )
 
     return model