Commit 8e4bb4f

fegin authored and wwwjn committed
[CP] Refactor Context Parallel to use new PyTorch CP APIs (#2144)
Stack from [ghstack](https://github.com/ezyang/ghstack/tree/0.14.0) (oldest at bottom):

* #2145
* __->__ #2144

**Summary**

1. Refactored CP dispatching:
   - A new `apply_cp()` function uses PyTorch's `_ContextParallel` parallelization plan to dispatch the attention call.
   - Enables the CP dispatcher for the SDPA attention type inside `apply_cp()`.
2. New CP data sharding approach:
   - Added a `cp_shard()` helper function that wraps PyTorch's `_context_parallel_shard` API.
   - Uses `_HeadTailLoadBalancer` for SDPA attention load balancing. FlexAttention CP support is deferred to a future PR.
   - CP sharding now happens explicitly in `post_dataloading_process()`, where inputs, labels, and positions are sharded.
   - The new `positions` argument allows us to not shard `freqs_cis`.

Note that this PR requires pytorch/pytorch#170200.

**Test**

```
-> % python3 scripts/loss_compare.py . chienchin/loss_compare --baseline-options="--parallelism.context_parallel_degree=8" --test-options="--parallelism.context_parallel_degree=8" --steps=100 --assert-equal
pick 5903566a Improve the loss_compare.sh logic
[LOSS_COMPARE]
[LOSS_COMPARE] Asserting losses are equal...
[LOSS_COMPARE] Baseline log: /tmp/baseline_training.log
[LOSS_COMPARE] Test log: /tmp/test_training.log
[LOSS_COMPARE] Extracted 100 steps from baseline log
[LOSS_COMPARE] Extracted 100 steps from test log
test_losses_equal (__main__.assert_losses_equal.<locals>.LossEqualityTest.test_losses_equal) ... ok

----------------------------------------------------------------------
Ran 1 test in 0.000s

OK
[LOSS_COMPARE] All losses are equal. Assertion passed!
[LOSS_COMPARE] ==========================================
[LOSS_COMPARE] LOSS COMPARISON ANALYSIS
[LOSS_COMPARE] ==========================================
[LOSS_COMPARE] Step-by-step loss comparison:
[LOSS_COMPARE] Step  Baseline Loss  Test Loss  Difference
[LOSS_COMPARE] ----  -------------  ---------  ----------
[LOSS_COMPARE]    1         8.1309     8.1309    0.000000
[LOSS_COMPARE]    2         7.8268     7.8268    0.000000
[LOSS_COMPARE]    3         7.2284     7.2284    0.000000
[LOSS_COMPARE]    4         6.4669     6.4669    0.000000
[LOSS_COMPARE]    5         5.4017     5.4017    0.000000
[LOSS_COMPARE]    6         4.7656     4.7656    0.000000
[LOSS_COMPARE]    7         4.3587     4.3587    0.000000
[LOSS_COMPARE]    8         4.0938     4.0938    0.000000
[LOSS_COMPARE]    9         4.4019     4.4019    0.000000
[LOSS_COMPARE]   10         3.7451     3.7451    0.000000
....
[LOSS_COMPARE]   90         2.802      2.802     0.000000
[LOSS_COMPARE]   91         2.7207     2.7207    0.000000
[LOSS_COMPARE]   92         2.7454     2.7454    0.000000
[LOSS_COMPARE]   93         2.6992     2.6992    0.000000
[LOSS_COMPARE]   94         2.743      2.743     0.000000
[LOSS_COMPARE]   95         2.7534     2.7534    0.000000
[LOSS_COMPARE]   96         2.8403     2.8403    0.000000
[LOSS_COMPARE]   97         2.783      2.783     0.000000
[LOSS_COMPARE]   98         3.0892     3.0892    0.000000
[LOSS_COMPARE]   99         2.7905     2.7905    0.000000
[LOSS_COMPARE]  100         2.733      2.733     0.000000
[LOSS_COMPARE]
[LOSS_COMPARE] Summary statistics:
[LOSS_COMPARE]   Average baseline loss: 3.1414940000000002
[LOSS_COMPARE]   Average test loss: 3.1414940000000002
[LOSS_COMPARE]   Average difference: 0.000000
[LOSS_COMPARE]
[LOSS_COMPARE] Loss comparison complete. No results saved (no output folder specified).
```

**TODO**

- This PR will invalidate torch.compile + CP due to pytorch/pytorch#170110. We will have to wait for Dynamo to fix the issue or refactor the nn.Module core logic to avoid checking hook_id.
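For intuition, the `_HeadTailLoadBalancer` named above pairs each rank's chunk from the head of the sequence with the mirrored chunk from the tail, so causal-attention work is roughly even across CP ranks. A minimal pure-Python sketch of that assignment (the exact chunking is an assumption based on the scheme's name and common ring-attention practice, not TorchTitan's API):

```python
def head_tail_assignment(seq_len: int, world_size: int) -> list[list[range]]:
    """Split a sequence into 2 * world_size chunks and give rank i the
    i-th chunk from the head plus the mirrored chunk from the tail, so
    causal-attention cost is roughly equal across ranks."""
    assert seq_len % (2 * world_size) == 0
    chunk = seq_len // (2 * world_size)
    chunks = [range(i * chunk, (i + 1) * chunk) for i in range(2 * world_size)]
    return [[chunks[i], chunks[2 * world_size - 1 - i]] for i in range(world_size)]

# Example: seq_len=16 across 4 CP ranks -> each rank holds 4 tokens.
shards = head_tail_assignment(16, 4)
# rank 0 gets tokens 0-1 and 14-15; rank 3 gets tokens 6-7 and 8-9.
```

Early tokens attend to few keys and late tokens to many, so pairing head with tail evens out the per-rank cost that a plain contiguous split would skew.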
1 parent c5fd490 commit 8e4bb4f

File tree

20 files changed: +515 −169 lines

torchtitan/components/validate.py

Lines changed: 85 additions & 10 deletions
```diff
@@ -6,7 +6,7 @@
 
 from collections.abc import Callable
 from contextlib import AbstractContextManager
-from typing import TypeAlias
+from typing import Any, TypeAlias
 
 import torch
 import torch.nn as nn
@@ -17,14 +17,12 @@
 from torchtitan.components.tokenizer import BaseTokenizer
 from torchtitan.config import JobConfig
 from torchtitan.distributed import ParallelDims, utils as dist_utils
+from torchtitan.distributed.context_parallel import prepare_context_parallel_input
 from torchtitan.hf_datasets.text_datasets import build_text_validation_dataloader
 from torchtitan.tools import utils
 from torchtitan.tools.logging import logger
 
-ValidationContext: TypeAlias = Callable[
-    [AbstractContextManager[None] | None],
-    AbstractContextManager[None],
-]
+ValidationContext: TypeAlias = Callable[[], AbstractContextManager[None]]
 
 
 class BaseValidator:
@@ -67,6 +65,7 @@ def __init__(
         pp_has_last_stage: bool | None = None,
     ):
         self.job_config = job_config
+        self.tokenizer = tokenizer
         self.parallel_dims = parallel_dims
         self.loss_fn = loss_fn
         self.validation_dataloader = build_text_validation_dataloader(
@@ -89,6 +88,70 @@ def __init__(
                 "unequal sample counts across ranks when dataset is exhausted."
             )
 
+    def post_dataloading_process(
+        self,
+        input_dict: dict[str, torch.Tensor],
+        labels: torch.Tensor,
+        model_parts: list[nn.Module],
+    ) -> tuple[torch.Tensor, torch.Tensor, dict[str, torch.Tensor], dict[str, Any]]:
+        """
+        Post-processing hook after data loading and before model forward pass.
+
+        This method processes the raw data from the dataloader and prepares it for
+        the model's forward pass. It separates the main input tensor from auxiliary
+        inputs and constructs additional keyword arguments (e.g., attention masks).
+
+        Args:
+            input_dict: Dictionary containing tensors from the dataloader. Must
+                contain an "input" key with the main input tensor. May contain
+                additional keys for auxiliary inputs (e.g., position ids).
+            labels: Target labels for the batch.
+            model_parts: List of model parts for accessing model methods.
+
+        Returns:
+            A tuple of (inputs, labels, extra_inputs, extra_kwargs) where:
+            - inputs: Main input tensor extracted from input_dict["input"].
+            - labels: Target labels (potentially modified by CP sharding).
+            - extra_inputs: Dict of auxiliary input tensors (all keys except
+              "input" from input_dict). These are passed to the model forward
+              but are NOT forwarded across pipeline parallel stages.
+            - extra_kwargs: Dict of additional keyword arguments for model forward.
+              These ARE forwarded across pipeline parallel stages. Contains
+              attention_masks if flex attention is enabled.
+
+        Note:
+            The distinction between extra_inputs and extra_kwargs is important for
+            pipeline parallelism: extra_kwargs are forwarded to all pipeline stages,
+            while extra_inputs are only available to the first stage.
+        """
+        inputs = input_dict["input"]
+        extra_inputs = {k: v for k, v in input_dict.items() if k != "input"}
+        # For arguments, like attention_masks, we have to put them in a separate
+        # dict as extra_inputs are not forwarded to other stages in PP, but
+        # extra_kwargs are.
+        extra_kwargs: dict[str, Any] = {}
+
+        try:
+            # pyrefly: ignore [not-callable]
+            extra_kwargs["attention_masks"] = model_parts[0].get_attention_masks(
+                input_batch=inputs,
+                tokenizer=self.tokenizer,
+                extra_inputs=extra_inputs,
+            )
+        except TypeError:
+            pass
+
+        if self.parallel_dims.cp_enabled:
+            inputs, labels, extra_kwargs = prepare_context_parallel_input(
+                inputs,
+                labels,
+                extra_kwargs,
+                self.parallel_dims.get_mesh("cp"),
+                inputs.device,
+            )
+
+        return inputs, labels, extra_inputs, extra_kwargs
+
     @torch.no_grad()
     # pyrefly: ignore [bad-override]
     def validate(
@@ -117,9 +180,13 @@ def validate(
             self.metrics_processor.ntokens_since_last_log += labels.numel()
             for k, v in input_dict.items():
                 input_dict[k] = v.to(device_type)
-            inputs = input_dict["input"]
             labels = labels.to(device_type)
 
+            # Process data (extract inputs, handle attention masks, CP sharding)
+            inputs, labels, extra_inputs, extra_kwargs = self.post_dataloading_process(
+                input_dict, labels, model_parts
+            )
+
             # Count valid tokens for this batch
             local_valid_tokens = torch.tensor(0, dtype=torch.int64, device=device_type)
             # pyrefly: ignore [missing-attribute]
@@ -150,18 +217,24 @@ def validate(
                 assert self.pp_has_first_stage is not None
                 assert self.pp_has_last_stage is not None
                 # Pipeline Parallel forward inside eval() call
-                with self.validation_context(optional_context_parallel_ctx):
+                with self.validation_context():
                     targets, losses = (
                         (labels, []) if self.pp_has_last_stage else (None, None)
                     )
                     if self.pp_has_first_stage:
                         self.pp_schedule.eval(
                             inputs,
+                            **extra_inputs,
+                            **extra_kwargs,
                             target=targets,
                             losses=losses,
                         )
                     else:
-                        self.pp_schedule.eval(target=targets, losses=losses)
+                        self.pp_schedule.eval(
+                            **extra_kwargs,
+                            target=targets,
+                            losses=losses,
+                        )
 
                 # accumulate losses across pipeline microbatches
                 # TODO: PP+FSDP unexpectedly puts the loss back to the CPU
@@ -172,10 +245,12 @@ def validate(
                     else torch.tensor([-1.0], device=device_type)
                 )
             else:
-                with self.validation_context(optional_context_parallel_ctx):
+                with self.validation_context():
                     assert len(model_parts) == 1
                     with self.maybe_enable_amp:
-                        predictions = model_parts[0](inputs)
+                        predictions = model_parts[0](
+                            inputs, **extra_inputs, **extra_kwargs
+                        )
                     loss_sum = self.loss_fn(predictions, labels)
 
                 accumulated_losses.append(loss_sum.detach() / global_valid_tokens)
```
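The `positions` tensor mentioned in the summary is what lets each rank keep the full `freqs_cis` table: after head-tail sharding, a rank's tokens are non-contiguous, so it gathers its own rotary rows by position instead of sharding the table itself. A toy stand-in with plain lists (illustrative names only, not the torchtitan API):

```python
# After CP sharding, each rank holds a non-contiguous slice of positions
# and indexes the *full* frequency table locally.
seq_len, world_size, rank = 16, 4, 0
chunk = seq_len // (2 * world_size)

# Head-tail shard for this rank: one head chunk plus the mirrored tail chunk.
positions = list(range(rank * chunk, (rank + 1) * chunk)) + list(
    range(seq_len - (rank + 1) * chunk, seq_len - rank * chunk)
)

freqs = [p * 0.5 for p in range(seq_len)]  # stand-in for freqs_cis rows
local_freqs = [freqs[p] for p in positions]  # gather by position; no sharding of freqs
```

Because the gather is driven by the sharded `positions`, the frequency table stays replicated and never needs to know about the CP layout.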
torchtitan/distributed/context_parallel.py (new file)

Lines changed: 192 additions & 0 deletions

```diff
@@ -0,0 +1,192 @@
+# Copyright (c) Meta Platforms, Inc. and affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+
+from collections.abc import Sequence
+from typing import Any, cast
+
+import torch
+import torch.nn as nn
+from torch.distributed.device_mesh import DeviceMesh
+from torch.distributed.tensor.experimental._attention import (
+    _context_parallel_shard,
+    _ContextParallel,
+    _enable_context_parallel_dispatcher,
+    _HeadTailLoadBalancer,
+)
+from torch.distributed.tensor.parallel import parallelize_module
+
+from torchtitan.protocols.model import AttentionMasksType
+from torchtitan.tools.logging import logger
+
+
+def apply_cp_to_attention_module(
+    attention_modules: Sequence[nn.Module],
+    cp_mesh: DeviceMesh,
+    attention_type: str,
+) -> None:
+    """
+    Apply context parallelism to attention modules.
+
+    CP splits the sequence dimension across devices to enable training with
+    longer sequences. This function applies CP to the provided attention
+    modules.
+
+    Args:
+        attention_modules: Sequence of attention modules to apply CP to
+        cp_mesh: Device mesh for context parallel dimension
+        attention_type: Type of attention mechanism. Must be one of:
+            - "sdpa": scaled_dot_product_attention()
+            - "flex": flex_attention()
+            - "varlen": varlen_attn() (not yet implemented)
+
+    Raises:
+        NotImplementedError: If attention_type is "varlen"
+    """
+    # Apply context parallelism to every attention module
+    # TODO: make seq_dim configurable once the implementation doesn't assume 2
+    # internally.
+    match attention_type:
+        case "flex":
+            cp_plan = _ContextParallel(
+                seq_dim=2, attention_type=_ContextParallel.AttentionType.FLEX
+            )
+        case "sdpa":
+            # Enable the DTensor dispatcher to route SDPA operations to the
+            # Context Parallel implementation. This is required for CP to work
+            # with SDPA (but not FlexAttention).
+            # Note: Use _disable_context_parallel_dispatcher() if you need to
+            # turn this off. In TorchTitan, we currently don't disable the CP
+            # dispatcher.
+            _enable_context_parallel_dispatcher()
+            cp_plan = _ContextParallel(
+                seq_dim=2, attention_type=_ContextParallel.AttentionType.SDPA
+            )
+        case "varlen":
+            raise NotImplementedError(
+                "Variable-length attention CP is not yet supported"
+            )
+        case _:
+            raise ValueError(
+                f"Invalid attention_type '{attention_type}'. "
+                f"Must be one of: 'sdpa', 'flex', 'varlen'"
+            )
+
+    for attention_module in attention_modules:
+        parallelize_module(
+            module=attention_module,
+            device_mesh=cp_mesh,
+            parallelize_plan=cp_plan,
+        )
+
+    logger.info("Applied Context Parallel to the model")
+
+
+def prepare_context_parallel_input(
+    inputs: torch.Tensor,
+    labels: torch.Tensor,
+    extra_kwargs: dict[str, Any],
+    cp_mesh: DeviceMesh,
+    device: torch.device,
+) -> tuple[torch.Tensor, torch.Tensor, dict[str, Any]]:
+    """
+    Prepare inputs, labels, and attention masks for Context Parallel forward pass.
+
+    This function prepares tensors for context parallel by:
+    1. Creating position indices based on input sequence length
+    2. Sharding inputs, labels, and positions across the CP mesh
+    3. Sharding attention masks if present
+
+    Args:
+        inputs: Input tensor of shape [batch_size, seq_len]
+        labels: Label tensor of shape [batch_size, seq_len]
+        extra_kwargs: Dictionary that may contain 'attention_masks' to be sharded
+        cp_mesh: Device mesh for context parallel dimension
+        device: Device to create position tensor on
+
+    Returns:
+        Tuple of (sharded_inputs, sharded_labels, updated_extra_kwargs) where:
+        - sharded_inputs: Inputs sharded along sequence dimension
+        - sharded_labels: Labels sharded along sequence dimension
+        - updated_extra_kwargs: Dict with sharded 'positions' and optionally
+          sharded 'attention_masks'
+    """
+    attention_masks = extra_kwargs.get("attention_masks", None)
+    positions = torch.arange(
+        0, inputs.shape[1], dtype=torch.int32, device=device
+    ).expand(inputs.shape)
+    (inputs, labels, positions), attention_masks = cp_shard(
+        cp_mesh,
+        (inputs, labels, positions),
+        attention_masks,
+    )
+    extra_kwargs["positions"] = positions
+    if attention_masks is not None:
+        extra_kwargs["attention_masks"] = attention_masks
+
+    return inputs, labels, extra_kwargs
+
+
+def cp_shard(
+    cp_mesh: DeviceMesh,
+    inputs: tuple[torch.Tensor, ...],
+    attention_masks: AttentionMasksType | None,
+    disable_load_balancer: bool = False,
+    input_seq_dim: int = 1,
+) -> tuple[tuple[torch.Tensor, ...], AttentionMasksType | None]:
+    """
+    Shard inputs and attention masks across the context parallel mesh.
+
+    This function distributes input tensors across devices in the CP mesh
+    along the sequence dimension. It optionally uses a load balancer to
+    handle uneven computation workload. Currently, HeadTailLoadBalancer is
+    used for SDPA + CP, which is the only supported configuration.
+
+    Args:
+        cp_mesh: Device mesh for context parallel dimension
+        inputs: Tuple of input tensors to be sharded along the sequence
+            dimension
+        attention_masks: Attention masks to be sharded (currently raises
+            error as FlexAttention CP is not yet supported)
+        disable_load_balancer: If True, disables load balancing. If False
+            (default), uses HeadTailLoadBalancer for SDPA to handle uneven
+            computation workload.
+        input_seq_dim: Sequence dimension index for sharding. Defaults to 1,
+            which covers most use cases where tensors have shape
+            [batch_size, seq_len, ...]. Can be changed by passing a
+            different value if your tensors use a different sequence
+            dimension layout.
+
+    Returns:
+        Tuple of (sharded_inputs, attention_masks) where:
+        - sharded_inputs: Tuple of input tensors sharded along the
+          sequence dimension
+        - attention_masks: Attention masks (currently unchanged/None)
+    """
+    seq_len = inputs[0].size(input_seq_dim)
+    cp_world_size = cp_mesh.size(0)
+    if attention_masks is not None:
+        raise ValueError(
+            "FlexAttention CP is not supported yet. Will come in the next PR."
+        )
+    else:
+        # For SDPA, we use the _HeadTailLoadBalancer.
+        load_balancer = (
+            None
+            if disable_load_balancer
+            else _HeadTailLoadBalancer(seq_len, cp_world_size, cp_mesh.device_type)
+        )
+
+    inputs = cast(
+        tuple[torch.Tensor, ...],
+        _context_parallel_shard(
+            mesh=cp_mesh,
+            buffers=inputs,
+            seq_dims=tuple(input_seq_dim for _ in inputs),
+            load_balancer=load_balancer,
+        ),
+    )
+
+    return inputs, attention_masks
```
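With the load balancer disabled, `cp_shard()` conceptually reduces to a contiguous split of every buffer along its sequence dimension, one slice per CP rank. A toy stand-in using nested lists (an illustrative sketch, not the real `_context_parallel_shard`, which operates on tensors and device meshes):

```python
def naive_cp_shard(buffers, world_size, rank, seq_dim=1):
    """Toy stand-in for a CP shard with load balancing disabled: each rank
    keeps its contiguous 1/world_size slice of the sequence dimension.
    Buffers are nested lists shaped [batch, seq] (seq_dim=1) or [seq] (seq_dim=0)."""
    out = []
    for buf in buffers:
        seq_len = len(buf[0]) if seq_dim == 1 else len(buf)
        assert seq_len % world_size == 0
        n = seq_len // world_size
        if seq_dim == 1:
            out.append([row[rank * n:(rank + 1) * n] for row in buf])
        else:
            out.append(buf[rank * n:(rank + 1) * n])
    return tuple(out)

inputs = [[10, 11, 12, 13]]  # [batch=1, seq=4]
labels = [[11, 12, 13, 14]]
sharded = naive_cp_shard((inputs, labels), world_size=2, rank=1)
# rank 1 keeps the second half of each buffer: ([[12, 13]], [[13, 14]])
```

Note how every buffer in the tuple is sliced with the same sequence dimension, mirroring how `cp_shard()` passes one `input_seq_dim` for all entries of `seq_dims`.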

torchtitan/distributed/utils.py

Lines changed: 2 additions & 8 deletions
```diff
@@ -224,23 +224,17 @@ def create_context_parallel_ctx(
 
 class TrainContext(Protocol):
     @abstractmethod
-    def __call__(
-        self,
-        cp_context: contextlib.AbstractContextManager[None] | None = None,
-    ) -> contextlib.AbstractContextManager[None]:
+    def __call__(self) -> contextlib.AbstractContextManager[None]:
         pass
 
 
 def get_train_context(enable_loss_parallel: bool) -> TrainContext:
     @contextlib.contextmanager
-    def context(cp_context: contextlib.AbstractContextManager[None] | None = None):
+    def context():
         with contextlib.ExitStack() as stack:
             if enable_loss_parallel:
                 stack.enter_context(torch.distributed.tensor.parallel.loss_parallel())
 
-            if cp_context:
-                stack.enter_context(cp_context)
-
             yield
 
     return context
```
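The simplified `get_train_context` above is just an `ExitStack` that conditionally enters the loss-parallel context; with the CP context gone, nothing else is stacked. A runnable sketch of the same pattern, with a stand-in recorder in place of `torch.distributed.tensor.parallel.loss_parallel()` so it runs without torch:

```python
import contextlib


def get_train_context(enable_loss_parallel: bool):
    """Sketch of the simplified train context: an ExitStack that enters
    the loss-parallel context only when enabled. Returns the context
    factory plus a list recording which contexts were entered."""
    entered = []

    @contextlib.contextmanager
    def loss_parallel():  # stand-in for torch's loss_parallel()
        entered.append("loss_parallel")
        yield

    @contextlib.contextmanager
    def context():
        with contextlib.ExitStack() as stack:
            if enable_loss_parallel:
                stack.enter_context(loss_parallel())
            yield

    return context, entered


context, entered = get_train_context(enable_loss_parallel=True)
with context():
    pass
# entered == ["loss_parallel"]
```

Callers now invoke `self.validation_context()` with no arguments, because CP sharding moved out of the context manager and into `post_dataloading_process()`.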

0 commit comments
