Skip to content

Commit d4a7e9b

Browse files
committed
rebase
1 parent 0e2b303 commit d4a7e9b

File tree

4 files changed

+78
-39
lines changed

4 files changed

+78
-39
lines changed

torchtitan/distributed/dual_pipe_v.py

Lines changed: 75 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -43,10 +43,11 @@ def get_dual_pipe_v_flag(job_config, parallel_dims) -> bool:
4343
)
4444

4545
if dual_pipe_v and job_config.activation_checkpoint.mode != "none":
46-
raise NotImplementedError(
47-
"Expert Parallel with DualPipeV and Activation Checkpointing "
48-
"cannot be used together. Please disable one of them."
49-
)
46+
pass
47+
# raise NotImplementedError(
48+
# "Expert Parallel with DualPipeV and Activation Checkpointing "
49+
# "cannot be used together. Please disable one of them."
50+
# )
5051

5152
return dual_pipe_v
5253

@@ -96,6 +97,11 @@ def _apply(self, module: nn.Module, device_mesh: DeviceMesh) -> nn.Module:
9697
)
9798

9899

100+
# Thread-local flag to track if we're in the backward thread
101+
# Any SyncHook.forward call from the backward thread is checkpoint recomputation
102+
_backward_thread_flag = threading.local()
103+
104+
99105
class HookCoordinator:
100106
def __init__(self):
101107
# Barrier for 2 threads (forward and backward) to synchronize
@@ -139,6 +145,16 @@ def is_coordination_enabled(self):
139145
return self._coordination_enabled
140146

141147

148+
def _is_in_backward_thread() -> bool:
149+
"""Check if current thread is the backward thread."""
150+
return getattr(_backward_thread_flag, 'value', False)
151+
152+
153+
def _set_backward_thread_flag(value: bool):
154+
"""Set the backward thread flag for current thread."""
155+
_backward_thread_flag.value = value
156+
157+
142158
# Global coordinator
143159
_hook_coordinator = HookCoordinator()
144160

@@ -147,6 +163,16 @@ class SyncHook(torch.autograd.Function):
147163
@staticmethod
148164
def forward(ctx, x, hook_name=""):
149165
ctx.hook_name = hook_name
166+
167+
# Skip barrier if we're in the backward thread - this means we're being called
168+
# during checkpoint recomputation (the forward thread never sets this flag)
169+
if _is_in_backward_thread():
170+
print("skipping backward barrier", flush=True)
171+
ctx.skip_backward_barrier = True
172+
return x
173+
174+
ctx.skip_backward_barrier = False
175+
150176
# handle edge case for transformer level boundary
151177
if _hook_coordinator._coordination_enabled and hook_name == "D":
152178
_hook_coordinator._cycle_count += 1
@@ -161,6 +187,13 @@ def forward(ctx, x, hook_name=""):
161187
def backward(ctx, grad_output):
162188
hook_name = ctx.hook_name
163189

190+
# Skip barrier if this backward corresponds to a checkpoint recompute forward
191+
# These are "extra" backward nodes created by checkpoint that don't have
192+
# corresponding partners in the other thread
193+
if ctx.skip_backward_barrier:
194+
print("skipping backward barrier", flush=True)
195+
return grad_output, None
196+
164197
# Edge case, skip initial barrier, all subsequent backward hooks will acquire
165198
if hook_name == "D" and _hook_coordinator._cycle_count == 0:
166199
return grad_output, None
@@ -262,32 +295,44 @@ def overlap_callback(action: _Action, ctx: _PipelineContext):
262295
main_stream = torch.accelerator.current_stream(device_module)
263296

264297
# Shared container for exception from backward thread
265-
def run_backward():
266-
schedule._assert_unsharded(backward_stage)
267-
# Set the backward thread to use the same stream as forward
268-
device_module.set_stream(main_stream)
269-
with record_function(
270-
f"backward_stage_{backward_stage_index}_mb_{backward_mb_index}"
271-
):
272-
loss = schedule._maybe_get_loss(backward_stage, backward_mb_index)
273-
schedule.backward_counter[backward_stage_index] += 1
274-
last_backward = (
275-
schedule.backward_counter[backward_stage_index]
276-
== schedule._n_microbatches
277-
)
278-
backward_stage.backward_one_chunk(
279-
backward_mb_index,
280-
loss=loss,
281-
full_backward=True,
282-
last_backward=last_backward,
283-
)
298+
backward_exception: list[BaseException] = []
284299

285-
if backward_is_prev_stage_on_this_rank:
286-
stage_index_to_stage[backward_stage_index - 1].set_local_bwd_input(
287-
backward_stage.get_local_bwd_output(backward_mb_index),
300+
def run_backward():
301+
# Mark this thread as the backward thread so SyncHook.forward
302+
# can detect checkpoint recomputation (forward called from backward thread)
303+
_set_backward_thread_flag(True)
304+
try:
305+
schedule._assert_unsharded(backward_stage)
306+
# Set the backward thread to use the same stream as forward
307+
device_module.set_stream(main_stream)
308+
with record_function(
309+
f"backward_stage_{backward_stage_index}_mb_{backward_mb_index}"
310+
):
311+
loss = schedule._maybe_get_loss(backward_stage, backward_mb_index)
312+
schedule.backward_counter[backward_stage_index] += 1
313+
last_backward = (
314+
schedule.backward_counter[backward_stage_index]
315+
== schedule._n_microbatches
316+
)
317+
backward_stage.backward_one_chunk(
288318
backward_mb_index,
319+
loss=loss,
320+
full_backward=True,
321+
last_backward=last_backward,
289322
)
290323

324+
if backward_is_prev_stage_on_this_rank:
325+
stage_index_to_stage[backward_stage_index - 1].set_local_bwd_input(
326+
backward_stage.get_local_bwd_output(backward_mb_index),
327+
backward_mb_index,
328+
)
329+
except BaseException as e:
330+
backward_exception.append(e)
331+
# Abort barrier to unblock forward thread if it's waiting
332+
_hook_coordinator.disable_coordination()
333+
finally:
334+
_set_backward_thread_flag(False)
335+
291336
def run_forward():
292337
schedule._assert_unsharded(forward_stage)
293338
output = forward_stage.forward_one_chunk(
@@ -315,3 +360,7 @@ def run_forward():
315360
thread.join()
316361

317362
_hook_coordinator.disable_coordination()
363+
364+
# Re-raise exception from backward thread with full traceback
365+
if backward_exception:
366+
raise backward_exception[0]

torchtitan/models/deepseek_v3/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
dim=256,
3333
inter_dim=1024,
3434
moe_inter_dim=256,
35-
n_layers=6,
35+
n_layers=24,
3636
n_dense_layers=1,
3737
n_heads=16,
3838
moe_args=MoEArgs(

torchtitan/models/deepseek_v3/train_configs/debug_model.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ description = "DeepSeek-V3 debug training"
44
print_config = false
55

66
[profiling]
7-
enable_profiling = true
7+
enable_profiling = false
88
save_traces_folder = "profile_trace"
99
profile_freq = 1
1010
profiler_warmup = 0

torchtitan/models/llama4/infra/parallelize.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,15 +24,10 @@
2424
from torchtitan.config.job_config import Compile as CompileConfig
2525
from torchtitan.distributed import NoParallel, ParallelDims
2626
from torchtitan.distributed.activation_checkpoint import apply_ac
27-
<<<<<<< HEAD
2827
from torchtitan.distributed.dual_pipe_v import (
2928
DualPipeExpertParallel,
3029
get_dual_pipe_v_flag,
3130
)
32-
=======
33-
from torchtitan.distributed.dual_pipe_v import DualPipeExpertParallel
34-
>>>>>>> f3e551fb (Enable PP and EP overlap for MoE)
35-
3631
from torchtitan.distributed.expert_parallel import (
3732
BaseExpertParallel,
3833
ExpertParallel,
@@ -123,12 +118,7 @@ def parallelize_llama(
123118
else None
124119
),
125120
etp_enabled=parallel_dims.etp_enabled,
126-
<<<<<<< HEAD
127121
dual_pipe_v=dual_pipe_v,
128-
=======
129-
dual_pipe_v=job_config.parallelism.pipeline_parallel_expert_parallel_overlap
130-
and job_config.parallelism.pipeline_parallel_schedule == "dualpipev",
131-
>>>>>>> f3e551fb (Enable PP and EP overlap for MoE)
132122
)
133123

134124
model_compile_enabled = (
@@ -513,7 +503,7 @@ def apply_moe_ep_tp(
513503
elif tp_mesh is None or not etp_enabled:
514504
experts_mesh = ep_mesh
515505
# input / output sharding on the batch / tokens dim
516-
experts_plan = DualPipeExpertParallel() if dual_pipe_v else ExpertParallel()
506+
experts_plan = ExpertParallel()
517507
else:
518508
experts_mesh = ep_tp_mesh
519509
experts_plan = ExpertTensorParallel()

0 commit comments

Comments (0)