Draft
Changes from all commits
Commits
50 commits
c335f6e
train with only layer distillation losses
oleksost Dec 16, 2025
e06a4b2
unscaled loss logging + training with distillation loss factor = 0
oleksost Dec 16, 2025
179ae25
make logging more explicit
oleksost Dec 17, 2025
af456f0
Merge remote-tracking branch 'origin/main' into train_only_layer_losses
oleksost Dec 17, 2025
9968aac
clean + tests
oleksost Dec 17, 2025
945c5a7
nvm
oleksost Dec 17, 2025
4b6e3d7
forward KL
oleksost Dec 19, 2025
c5fefa0
test forward kl
oleksost Dec 19, 2025
4119596
wip: report unscaled + kl loss
oleksost Dec 19, 2025
b55a0a4
loss config
oleksost Dec 22, 2025
097baeb
wip
oleksost Dec 22, 2025
d773d98
tests
oleksost Dec 22, 2025
35400c1
Merge remote-tracking branch 'origin/main' into train_only_layer_losses
oleksost Dec 22, 2025
282925c
test
oleksost Dec 22, 2025
0f73ea2
tests
oleksost Dec 22, 2025
04a0193
Merge branch 'main' into train_only_layer_losses
oleksost Dec 22, 2025
fa85c41
wip
oleksost Dec 22, 2025
feb416e
Merge branch 'train_only_layer_losses' of https://github.com/ServiceN…
oleksost Dec 22, 2025
31cfb84
wip
oleksost Dec 23, 2025
24fe67b
no grad if factor 0
oleksost Dec 23, 2025
00f6118
Merge remote-tracking branch 'origin/main' into train_only_layer_losses
oleksost Dec 23, 2025
0cadf98
Merge branch 'main' into train_only_layer_losses
oleksost Dec 23, 2025
0e562e9
addressed comments
oleksost Dec 23, 2025
2a474e2
Merge branch 'train_only_layer_losses' of https://github.com/ServiceN…
oleksost Dec 23, 2025
52c1c11
addressed comments
oleksost Dec 23, 2025
406d0a2
Removed Targets class
oleksost Dec 30, 2025
f25380a
fixes
oleksost Dec 30, 2025
8adb7dd
imports
oleksost Dec 30, 2025
1ce641d
polish naming
oleksost Jan 6, 2026
95f14af
addressing comments
oleksost Jan 8, 2026
5ad4c0c
explicit z_loss grads
oleksost Jan 8, 2026
0a66e14
removed z_loss as aux loss
oleksost Jan 8, 2026
f8f7041
move loss configs to the lm config
oleksost Jan 8, 2026
ab9c917
tests
oleksost Jan 8, 2026
89470dc
Merge branch 'main' into train_only_layer_losses
oleksost Jan 9, 2026
6e54c93
comments
oleksost Jan 12, 2026
8137b8c
Merge remote-tracking branch 'origin/main' into train_only_layer_losses
jlamypoirier Jan 13, 2026
3c8f3c2
misc
jlamypoirier Jan 13, 2026
705c482
fix
jlamypoirier Jan 13, 2026
4fbc7a8
stuff
jlamypoirier Jan 13, 2026
99a73b5
fixes
jlamypoirier Jan 13, 2026
fb679d1
stuff
jlamypoirier Jan 16, 2026
982f945
Merge remote-tracking branch 'origin/main' into jlp_entropy_loss
jlamypoirier Jan 16, 2026
3c8ce50
Merge branch 'main' into train_only_layer_losses
jlamypoirier Jan 16, 2026
63ac004
Merge remote-tracking branch 'origin/train_only_layer_losses' into jl…
jlamypoirier Jan 16, 2026
afc33f3
stuff
jlamypoirier Jan 16, 2026
f8dcce6
stuff
jlamypoirier Jan 16, 2026
98ee4fb
Merge branch 'jlp_cpu' into jlp_entropy_loss
jlamypoirier Jan 16, 2026
2a4362f
fixes
jlamypoirier Jan 16, 2026
b464e4e
fixes
jlamypoirier Jan 16, 2026
9 changes: 6 additions & 3 deletions fast_llm/core/distributed.py
@@ -72,10 +72,12 @@ def check_parallel_match(tensor: torch.Tensor, group: ProcessGroup | None, name:
         )


-def safe_barrier(group: ProcessGroup | None, value: int | str = 1, timeout: float | None = None) -> None:
+def safe_barrier(
+    group: ProcessGroup | None, value: int | str = 1, timeout: float | None = None, device: torch.device | None = None
+) -> None:
     if group:
         hashed = hash(value) % 2**32
-        out = allreduce_scalar(hashed, dtype=torch.int64, group=group, timeout=timeout)
+        out = allreduce_scalar(hashed, dtype=torch.int64, group=group, timeout=timeout, device=device)
         if out != hashed * group.size():
             raise RuntimeError(f"Desync detected for barrier {value} ({out}!={hashed*group.size()})")

@@ -86,9 +88,10 @@ def allreduce_scalar(
     group: torch.distributed.ProcessGroup | None = None,
     op=ReduceOp.SUM,
     timeout: float | None = None,
+    device: torch.device | None = None,
 ) -> float | int:
     if group:
         value = torch.full([1], value, dtype=dtype, device=torch.cuda.current_device() if device is None else device)
         with set_timeout(group, timeout):
             torch.distributed.all_reduce(value, op=op, group=group)
         return value.item()
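
The new `device` argument lets callers route the barrier's all-reduce onto an explicit device instead of the previously hard-coded `torch.cuda.current_device()`. A minimal sketch of how this might be exercised with a CPU (gloo) group; the single-process rendezvous and the call site below are assumptions for illustration, not part of this diff.

import torch
import torch.distributed as dist

from fast_llm.core.distributed import safe_barrier

# Assumed setup: a single-process gloo group with a local TCP rendezvous.
dist.init_process_group(backend="gloo", init_method="tcp://127.0.0.1:29500", rank=0, world_size=1)
group = dist.new_group(backend="gloo")

# Passing device=torch.device("cpu") makes allreduce_scalar allocate its scalar
# tensor on the CPU, so the barrier also works on hosts without a GPU.
safe_barrier(group, value="end_of_step", device=torch.device("cpu"))

dist.destroy_process_group()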
4 changes: 3 additions & 1 deletion fast_llm/engine/schedule/runner.py
@@ -327,7 +327,9 @@ def _preprocess_data(
         self, context: BatchContext, data_iterator: typing.Iterator, preprocessed: bool
     ) -> typing.Generator[None, None, None]:
         batch_config = context.schedule.batch_config
-        grad_output = (1 if self._optimizer is None else self._optimizer.grad_scale) / batch_config.num_inputs
+        grad_output = (
+            self._optimizer.grad_scale / batch_config.num_inputs if context.schedule.phase.is_training else None
+        )
         for micro_batch in range(batch_config.sequential_micro_batches):
             micro_batch_data = next(data_iterator)
             if not preprocessed:
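
With this change, a gradient scale is only computed for training phases; evaluation schedules pass `grad_output = None`, so no backward scaling is requested. A hedged restatement of the new logic as a standalone helper (the function name and arguments are illustrative, not part of the diff):

def compute_grad_output(optimizer, num_inputs: int, is_training: bool) -> float | None:
    # Training: the optimizer's grad scale, split across the sequential micro-batches.
    # Evaluation or inference: None, so the runner skips backward-pass scaling entirely.
    return optimizer.grad_scale / num_inputs if is_training else None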
7 changes: 4 additions & 3 deletions fast_llm/functional/config.py
@@ -93,16 +93,17 @@ def _set_activation_fn_map() -> None:
 MAX_DROPLESS_BLOCK_SIZE_ROW = 128


-class CrossEntropyImpl(str, enum.Enum):
+class EntropyLossImplementation(enum.StrEnum):
     auto = "auto"
     torch = "torch"
     fused = "fused"
     triton = "triton"


-class DistillationLossImpl(str, enum.Enum):
-    reverse_kl = "reverse_kl"
+class EntropyLossType(enum.StrEnum):
     cross_entropy = "cross_entropy"
+    forward_kl = "forward_kl"
+    reverse_kl = "reverse_kl"


 class TargetFormat(enum.StrEnum):
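
The renamed enums split the loss selection into two axes: `EntropyLossImplementation` picks the kernel (auto, torch, fused, triton) while `EntropyLossType` picks the objective (cross-entropy, forward KL, or reverse KL). A small sketch of selecting them from string config values; the `loss_config` dict below is an assumption for illustration, not an API from this diff.

from fast_llm.functional.config import EntropyLossImplementation, EntropyLossType

# Hypothetical config values; enum.StrEnum members compare equal to their string values.
loss_config = {"type": "forward_kl", "implementation": "auto"}

loss_type = EntropyLossType(loss_config["type"])                            # EntropyLossType.forward_kl
implementation = EntropyLossImplementation(loss_config["implementation"])   # EntropyLossImplementation.auto

# reverse_kl covers the distillation objective previously selected via DistillationLossImpl;
# cross_entropy and forward_kl are the other objectives exposed by EntropyLossType.
assert loss_type == "forward_kl" and implementation == EntropyLossImplementation.auto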