
Commit 2aed291

LouisYRYJ and smarter authored
Final EKFAC implementation (#123)
* ekfac implementation done (untested)
* remove unnecessary squeeze
* add tkfac
* fix claude issues
* shampoo
* minor fix
* Add EKFAC tests and fix a couple of bugs (#125)
* Fix mask bug and add batch size invariance test with toy model

  The backward_hook was using g.reshape(-1, O), which includes padding positions
  in the covariance computation. This causes incorrect results when batches have
  different sequence lengths. Before this commit, the added test failed with:

  > FAILED tests/ekfac_tests/test_batch_size_invariance.py::test_trace_batch_invariant[seq_lengths1-20] - AssertionError: Scalars are not close!
  >
  > Expected 1.231401894309304 but got 0.8983965093439276.
  > Absolute difference: 0.33300538496537635 (up to 1e-4 allowed)
  > Relative difference: 0.27042786478102654 (up to 0.01 allowed)

* Fix use_dataset_labels condition and add FIM accuracy test

  The condition `if not hessian_cfg.use_dataset_labels:` was inverted, causing
  the empirical Fisher (with dataset labels) to use sampled labels and vice
  versa.

  Add test_fim_accuracy.py, which verifies that KFAC approximates the Fisher
  Information Matrix within tolerance for both the empirical FIM (dataset
  labels) and the true FIM (sampled labels).

* Add ground truth EKFAC tests

  This is still missing FSDP support and test_apply_ekfac.py from #68

  Co-Authored-By: LouisYRYJ <[email protected]>

* ekfac_tests/test_batch_size_invariance.py: Fix error thresholds when running on CPU

* Cleanup EKFAC tests

  - Replace set_all_seeds with the existing setup_reproducibility
  - Reuse approximate_hessians instead of doing something equivalent manually

* Add --token_batch_size option to EKFAC tests

* Add --n_samples option to EKFAC tests

  Allow configuring the number of samples from the pile-10k dataset via a
  pytest command line option instead of hardcoding 100. The dataset directory
  is now named dynamically (e.g., pile_100_examples).
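The mask bug fixed in #125 can be reproduced in a few lines. This is a toy sketch with arbitrary shapes and plain tensors (not the repository's actual hook machinery), showing why `g.reshape(-1, O)` lets padding positions pollute the covariance while a boolean mask keeps the result invariant to padding:

```python
import torch

torch.manual_seed(0)
O = 3
# Two sequences padded to length 5; the second has only 3 valid positions.
g = torch.randn(2, 5, O)
mask = torch.tensor([[True, True, True, True, True],
                     [True, True, True, False, False]])

# Buggy accumulation: flattening the sequence dimension includes padding rows.
flat = g.reshape(-1, O)
cov_buggy = flat.T @ flat

# Fixed accumulation: boolean-mask out padding before the outer product.
g_valid = g[mask]                  # [n_valid, O]
cov_fixed = g_valid.T @ g_valid

# Ground truth: per-sequence accumulation over the true lengths.
cov_ref = g[0].T @ g[0] + g[1, :3].T @ g[1, :3]

assert torch.allclose(cov_fixed, cov_ref)
assert not torch.allclose(cov_buggy, cov_ref)
```

With real (nonzero) activations at padded positions, the unmasked covariance depends on how much padding each batch happens to contain, which is exactly the batch-size variance the test caught.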
* hessians: Fix distributed support and test it

  Restore the calls to dist.barrier that existed in #13; without them the
  process would hang when running with world_size > 1.

  For testing, we add _allocate_batches_world to compute the batches for the
  ground truth. The tests don't pass due to numerical errors; this is handled
  in the next commit by changing our comparison logic.

* ekfac_tests: Use appropriate metrics for each comparison

  - Eigenvectors: Check |cosine_similarity| ≈ 1 per column, which naturally
    handles sign ambiguity (eigenvectors are only defined up to sign)
  - Covariances: Check relative Frobenius norm, since values should match exactly
  - Eigenvalue corrections: Align signs based on eigenvector orientation, then
    check relative error (λ[i,j] transforms as sign_G[i] * sign_A[j])
  - Also re-enable CPU tests, which pass after this change

* ekfac_tests: Relax thresholds for distributed runs

  With world_size > 1, the floating-point reduction order differs between the
  ground truth (single process) and the distributed run, causing larger
  numerical differences in some layers.

  For eigenvectors, use the average |cos_sim| instead of the minimum; this
  tolerates occasional outlier eigenvectors while maintaining a stricter
  threshold (1e-3 vs the 0.1 that would be needed for the min).

  For eigenvalue corrections, use atol=0.2 when world_size > 1.

* adjust test + normalize shampoo and tkfac

* minor fixes, correct tensor handling in shampoo and tkfac, introduce apply_hessian (WIP)

---------

Co-authored-by: Guillaume Martres <[email protected]>
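The sign-invariant eigenvector comparison described in the commit message can be sketched as follows. This is a minimal illustration of the |cos_sim| metric on hypothetical orthonormal matrices, not the test suite's actual helper:

```python
import torch

def eigvec_similarity(V_ref: torch.Tensor, V_test: torch.Tensor) -> torch.Tensor:
    """Per-column |cosine similarity| between two eigenvector matrices.

    Eigenvectors are only defined up to sign, so |cos_sim| is used:
    it equals 1.0 whenever V_test[:, j] == ±V_ref[:, j].
    """
    cos = torch.nn.functional.cosine_similarity(V_ref, V_test, dim=0)
    return cos.abs()

# Flipping the sign of some columns does not change the metric.
Q, _ = torch.linalg.qr(torch.randn(4, 4, dtype=torch.float64))
signs = torch.tensor([1.0, -1.0, 1.0, -1.0], dtype=torch.float64)
sims = eigvec_similarity(Q, Q * signs)
assert torch.allclose(sims, torch.ones(4, dtype=torch.float64))
```

Averaging `sims` instead of taking its minimum is what lets the distributed runs tolerate a few outlier columns while keeping a tight overall threshold.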
1 parent adbc3d1 commit 2aed291

28 files changed: +3612 −63 lines changed

bergson/__main__.py

Lines changed: 16 additions & 2 deletions
@@ -6,7 +6,8 @@
 from simple_parsing import ArgumentParser, ConflictResolution
 
 from .build import build
-from .config import IndexConfig, QueryConfig, ReduceConfig, ScoreConfig
+from .config import HessianConfig, IndexConfig, QueryConfig, ReduceConfig, ScoreConfig
+from .hessians.hessian_approximations import approximate_hessians
 from .query.query_index import query
 from .reduce import reduce
 from .score.score import score_dataset
@@ -98,11 +99,24 @@ def execute(self):
         query(self.query_cfg)
 
 
+@dataclass
+class Hessian:
+    """Approximate Hessian matrices using KFAC or EKFAC."""
+
+    hessian_cfg: HessianConfig
+    index_cfg: IndexConfig
+
+    def execute(self):
+        """Compute Hessian approximation."""
+        validate_run_path(self.index_cfg)
+        approximate_hessians(self.index_cfg, self.hessian_cfg)
+
+
 @dataclass
 class Main:
     """Routes to the subcommands."""
 
-    command: Union[Build, Query, Reduce, Score]
+    command: Union[Build, Query, Reduce, Score, Hessian]
 
     def execute(self):
         """Run the script."""
bergson/collector/collector.py

Lines changed: 122 additions & 14 deletions
@@ -1,11 +1,13 @@
 import functools
+import hashlib
 import os
 from abc import ABC, abstractmethod
 from contextlib import ContextDecorator, nullcontext
 from dataclasses import astuple, dataclass, field
 from fnmatch import fnmatchcase
 from typing import Callable, Literal, Mapping, Optional
 
+import numpy as np
 import torch
 import torch.distributed as dist
 import torch.nn as nn
@@ -24,15 +26,14 @@
 from tqdm.auto import tqdm
 from transformers import PreTrainedModel
 
-from bergson.config import AttentionConfig, IndexConfig
+from bergson.config import AttentionConfig, HessianConfig, IndexConfig
 from bergson.data import pad_and_tensor
 from bergson.gradients import (
     GradientProcessor,
     LayerAdapter,
 )
 from bergson.utils.logger import get_logger
 from bergson.utils.peft import set_peft_enabled
-from bergson.utils.utils import create_projection_matrix
 
 
 @dataclass
@@ -78,6 +79,7 @@ class HookCollectorBase(ContextDecorator, ABC):
     Optional configuration specifying how to split up the attention module gradients
     into per-head gradients. See also bergson.config.AttentionConfig.
     """
+    logger = get_logger("HookCollectorBase", level="INFO")
 
     def __post_init__(
         self,
@@ -256,6 +258,28 @@ def projection(
             self.processor._projection_matrices[key] = A
         return A
 
+    def with_batch(self, valid_mask: Tensor | None = None) -> "HookCollectorBase":
+        """
+        Set the current batch indices and valid mask before entering the context.
+
+        This allows hooks to access batch indices and valid mask during
+        forward/backward passes.
+        Usage:
+            with collector.with_batch(indices, valid_mask):
+                # forward/backward pass
+                # hooks can access self._current_indices and self._current_valid_mask
+
+        Args:
+            indices: List of data indices in the current batch.
+            valid_mask: Optional boolean tensor of shape [batch_size, seq_len]
+                indicating which positions have valid labels for loss computation.
+
+        Returns:
+            self, for use as a context manager.
+        """
+        self._current_valid_mask = valid_mask
+        return self
+
     def __enter__(self):
         """Register forward and backward hooks on all target modules."""
         for name in self.target_info:
@@ -484,15 +508,23 @@ def run_with_collector_hooks(
         ):
             batch = self.data[indices]
 
+            # Compute padded tensors and valid_mask before entering context
+            x, y, valid_mask = pad_and_tensor(
+                batch["input_ids"],
+                labels=batch.get("labels"),
+                device=self.model.device,
+            )
+            total_processed += valid_mask.sum()
+
             with (
-                self.collector,
+                self.collector.with_batch(valid_mask),
                 (
                     record_function(f"step_{step}")
                     if self.cfg.profile
                     else nullcontext()
                 ),
             ):
-                losses = self.forward_backward(self.model, batch)
+                losses = self.forward_backward(self.model, x, y, batch)
 
             # TODO: currently builder also calls torch.cuda.synchronize
             torch.cuda.synchronize() if torch.cuda.is_available() else None
@@ -503,11 +535,17 @@
             step += 1
 
             self.collector.process_batch(indices, losses=losses)
-            total_processed += len(indices)
 
         self.collector.teardown()
+
         if dist.is_initialized():
             dist.all_reduce(total_processed, op=dist.ReduceOp.SUM)
+
+        if self.rank == 0:
+            torch.save(
+                total_processed,
+                os.path.join(self.cfg.partial_run_path, "total_processed.pt"),
+            )
         self.logger.info(f"Total processed: {total_processed.item()}")
 
 
@@ -523,18 +561,17 @@ def fwd_bwd_factory(cfg: IndexConfig) -> Callable:
     summed loss.
 
     Returns:
-        A callable fwd_bwd(model, batch) -> Tensor that performs a forward pass and
-        backward pass, returning the per-sample losses.
-        The batch must contain "input_ids" and optionally "labels" and "advantage".
+        A callable fwd_bwd(model, x, y, batch) -> Tensor that performs a forward pass
+        and backward pass, returning the per-sample losses.
+        Args:
+            model: The model to run forward/backward on.
+            x: Padded input token ids tensor of shape [batch_size, seq_len].
+            y: Padded label tensor of shape [batch_size, seq_len] with -100 for padding.
+            batch: Original batch dict, used only for "advantage" if present.
     Returns a tensor of shape [batch_size] with one loss value per sample.
     """
 
-    def fwd_bwd(model, batch):
-        x, y = pad_and_tensor(
-            batch["input_ids"],  # type: ignore
-            labels=batch.get("labels"),  # type: ignore
-            device=model.device,
-        )
+    def fwd_bwd(model, x: Tensor, y: Tensor, batch: dict):
         logits = model(x).logits[:, :-1]
         masks = y[:, 1:] != -100
         denoms = (
@@ -571,3 +608,74 @@ def fwd_bwd(model, batch):
         return losses
 
     return fwd_bwd
+
+
+def fwd_bwd_hessian_factory(
+    index_cfg: IndexConfig, hessian_cfg: HessianConfig
+) -> Callable:
+    def fwd_bwd_hessian(model, x: Tensor, y: Tensor, batch: dict):
+        logits = model(x).logits[:, :-1]
+        masks = y[:, 1:] != -100
+        denoms = (
+            masks.sum(dim=1, dtype=model.dtype)
+            if index_cfg.loss_reduction == "mean"
+            else 1.0
+        )
+        if hessian_cfg.use_dataset_labels:
+            losses = F.cross_entropy(
+                logits.reshape(-1, logits.size(-1)),
+                y[:, 1:].flatten(),
+                reduction="none",
+            ).reshape_as(y[:, 1:])
+            losses = losses.sum(1) / denoms
+        else:
+            with torch.no_grad():
+                probs = F.softmax(logits, dim=-1)
+                sampled_tokens = torch.multinomial(
+                    probs.reshape(-1, probs.size(-1)),
+                    num_samples=1,
+                    replacement=True,
+                ).reshape_as(y[:, 1:])
+            losses = F.cross_entropy(
+                logits.reshape(-1, logits.size(-1)),
+                sampled_tokens.flatten(),
+                reduction="none",
+            ).reshape_as(y[:, 1:])
+            losses = losses.sum(1) / denoms
+
+        losses.sum().backward()
+        model.zero_grad()
+
+        return losses
+
+    return fwd_bwd_hessian
+
+
+def create_projection_matrix(
+    identifier: str,
+    m: int,
+    n: int,
+    dtype: torch.dtype,
+    device: torch.device,
+    projection_type: Literal["normal", "rademacher"] = "normal",
+) -> Tensor:
+    """Create a projection matrix deterministically based on identifier and side."""
+    # Seed the PRNG with the name of the layer and what "side" we are projecting
+    message = bytes(identifier, "utf-8")
+    digest = hashlib.md5(message).digest()
+    seed = int.from_bytes(digest, byteorder="big") % (2**63 - 1)
+
+    if projection_type == "normal":
+        prng = torch.Generator(device).manual_seed(seed)
+        A = torch.randn(m, n, device=device, dtype=dtype, generator=prng)
+    elif projection_type == "rademacher":
+        numpy_rng = np.random.Generator(np.random.PCG64(seed))
+        random_bytes = numpy_rng.bytes((m * n + 7) // 8)
+        random_bytes = np.frombuffer(random_bytes, dtype=np.uint8)
+        A = np.unpackbits(random_bytes)[: m * n].reshape((m, n))
+        A = torch.from_numpy(A).to(device, dtype=dtype)
+        A = A.add_(-0.5).mul_(2)
+    else:
+        raise ValueError(f"Unknown projection type: {projection_type}")
+    A /= A.norm(dim=1, keepdim=True)
+    return A
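The seeding scheme in create_projection_matrix above makes the matrix a pure function of its string identifier, so every rank can rebuild it independently without communication. A condensed sketch of just the seeding step (the layer identifier below is hypothetical, chosen for illustration):

```python
import hashlib

import torch

def seed_from_identifier(identifier: str) -> int:
    """Derive a deterministic 63-bit seed from a layer identifier, as in the diff above."""
    digest = hashlib.md5(identifier.encode("utf-8")).digest()
    return int.from_bytes(digest, byteorder="big") % (2**63 - 1)

# Same identifier -> same seed -> bit-identical projection matrix on every rank.
seed = seed_from_identifier("model.layers.0.mlp.down_proj/left")
A1 = torch.randn(4, 8, generator=torch.Generator().manual_seed(seed))
A2 = torch.randn(4, 8, generator=torch.Generator().manual_seed(seed))
assert torch.equal(A1, A2)
```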

bergson/config.py

Lines changed: 18 additions & 0 deletions
@@ -302,6 +302,24 @@ class ReduceConfig:
     """Whether to unit normalize the gradients before reducing them."""
 
 
+@dataclass
+class HessianConfig:
+    """Config for approximating the Hessian."""
+
+    method: Literal["kfac", "tkfac", "shampoo"] = "kfac"
+    """Method for approximating the Hessian."""
+
+    ev_correction: bool = False
+    """Whether to additionally compute eigenvalue correction."""
+
+    hessian_dtype: Literal["auto", "bf16", "fp16", "fp32"] = "auto"
+    """Precision (dtype) to use for the Hessian approximation."""
+
+    use_dataset_labels: bool = False
+    """Whether to use dataset labels for the Hessian (empirical Fisher) approximation.
+    If false, the model predictions will be used."""
+
+
 @dataclass
 class FaissConfig:
     """Configuration for FAISS index."""
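The use_dataset_labels flag added above selects between the empirical Fisher (dataset labels) and the true Fisher (labels sampled from the model's own distribution), mirroring the branching in fwd_bwd_hessian. A condensed standalone sketch with toy shapes and no model:

```python
import torch
import torch.nn.functional as F

torch.manual_seed(0)
logits = torch.randn(2, 4, 10)  # [batch, seq, vocab]

use_dataset_labels = False
if use_dataset_labels:
    # Empirical Fisher: targets come from the dataset (toy random labels here).
    targets = torch.randint(0, 10, (2, 4))
else:
    # True Fisher: sample targets from the model's predictive distribution.
    probs = F.softmax(logits, dim=-1)
    targets = torch.multinomial(
        probs.reshape(-1, probs.size(-1)), num_samples=1
    ).reshape(2, 4)

losses = F.cross_entropy(
    logits.reshape(-1, logits.size(-1)), targets.flatten(), reduction="none"
).reshape(2, 4)
assert losses.shape == (2, 4)
```

The inverted condition fixed in #125 swapped these two branches, which is why the FIM accuracy test exercises both settings.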

bergson/data.py

Lines changed: 29 additions & 6 deletions
@@ -76,6 +76,23 @@ def allocate_batches(
     """
     rank = dist.get_rank() if dist.is_initialized() else 0
     world_size = dist.get_world_size() if dist.is_initialized() else 1
+    (batches,) = _allocate_batches_world(doc_lengths, N, world_size, seed, ranks=[rank])
+    return batches
+
+
+def _allocate_batches_world(
+    doc_lengths: list[int],
+    N: int,
+    world_size: int,
+    seed: int = 42,
+    ranks: list[int] | None = None,
+) -> list[list[list[int]]]:
+    """Lower-level version of allocate_batches that returns batches for specified ranks.
+
+    If ranks is None, returns batches for all ranks.
+    """
+    if ranks is None:
+        ranks = list(range(world_size))
     if len(doc_lengths) < world_size:
         raise RuntimeError("Not enough documents to distribute across workers.")
 
@@ -162,11 +179,12 @@
     # Sanity: equal # of batches per worker
     assert len({len(b) for b in allocation}) == 1
 
-    # Break any systematic ordering of batches
-    random.seed(seed)
-    random.shuffle(allocation[rank])
+    # Break any systematic ordering of batches (shuffle only requested ranks)
+    for rank in ranks:
+        random.seed(seed)
+        random.shuffle(allocation[rank])
 
-    return allocation[rank]
+    return [allocation[rank] for rank in ranks]
 
 
 def create_index(
@@ -466,7 +484,7 @@ def pad_and_tensor(
     padding_value: int = 0,
     dtype: torch.dtype | None = torch.long,
     device: torch.device | None = None,
-) -> tuple[torch.Tensor, torch.Tensor]:
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
     """
     Pad a list of sequences to the same length and convert them to tensors.
     Returns a tuple of padded sequences and labels. The labels are the same as the
@@ -485,7 +503,12 @@
     # convert to tensor
     padded_tokens = torch.tensor(padded, dtype=dtype, device=device)
     padded_labels = torch.tensor(labels, dtype=dtype, device=device)
-    return padded_tokens, padded_labels
+    # Compute valid_masks: position i is valid if labels[i+1] != -100
+    N, S = padded_tokens.shape
+    valid_masks = torch.zeros(N, S, dtype=torch.bool, device=device)
+    valid_masks[:, :-1] = padded_labels[:, 1:] != -100
+
+    return padded_tokens, padded_labels, valid_masks
 
 
 def tokenize(batch: dict, *, args: DataConfig, tokenizer):
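The valid-mask construction that pad_and_tensor now returns can be checked by hand. A minimal sketch with a toy label tensor (the values are illustrative, only the -100 padding convention comes from the diff):

```python
import torch

# Labels use -100 for padding/ignored positions.
labels = torch.tensor([[5, 7, 9, -100, -100],
                       [3, 2, -100, -100, -100]])
N, S = labels.shape

# Position i is valid iff it predicts a real (non -100) token at i + 1,
# matching the next-token shift in the loss computation.
valid = torch.zeros(N, S, dtype=torch.bool)
valid[:, :-1] = labels[:, 1:] != -100

assert valid.tolist() == [[True, True, False, False, False],
                          [True, False, False, False, False]]
```

Summing this mask (rather than counting sequences) is what makes the total_processed counter in collector.py count valid tokens instead of documents.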
