Skip to content

Commit db7cf6d

Browse files
authored
Feat: Enable intra-node FP4 dispatch and BF16 cast to FP8 combine (#169)
- Add BF16→FP8 direct-cast quantization in the intranode combine kernel.
- Extend intranode dispatch/combine to support float4_e2m1fn_x2 (FP4).
- Add RDMA/IO env vars: MORI_IO_SL, MORI_RDMA_SL, MORI_IO_TC_DISABLE, MORI_IB_ENABLE_RELAXED_ORDERING.
1 parent 95ad1dd commit db7cf6d

File tree

14 files changed

+441
-117
lines changed

14 files changed

+441
-117
lines changed

examples/ops/dispatch_combine/test_dispatch_combine.py

Lines changed: 72 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -28,15 +28,23 @@
2828

2929
os.environ["MORI_SHMEM_HEAP_SIZE"] = "6G"
3030

31+
32+
def _is_fp4x2_dtype(dtype):
33+
return dtype is torch.float4_e2m1fn_x2
34+
35+
3136
class EpDispatchCombineTestCase:
32-
def __init__(self, rank, world_size, dtype=torch.bfloat16):
37+
def __init__(self, rank, world_size, dtype=torch.bfloat16, quant_type="none", hidden_dim=7168):
3338
self.rank = rank
3439
self.world_size = world_size
40+
# fp8_direct_cast requires use_external_inp_buf=True (not zero-copy)
41+
use_external_inp_buf = (quant_type == "fp8_direct_cast")
42+
cfg_hidden_dim = hidden_dim // 2 if _is_fp4x2_dtype(dtype) else hidden_dim
3543
self.config = mori.ops.EpDispatchCombineConfig(
3644
data_type=dtype,
3745
rank=self.rank,
3846
world_size=self.world_size,
39-
hidden_dim=7168,
47+
hidden_dim=cfg_hidden_dim,
4048
# scale_dim=32,
4149
scale_dim=0,
4250
scale_type_size=torch.tensor(
@@ -46,7 +54,8 @@ def __init__(self, rank, world_size, dtype=torch.bfloat16):
4654
max_num_inp_token_per_rank=4096,
4755
num_experts_per_rank=32,
4856
num_experts_per_token=8,
49-
use_external_inp_buf=False,
57+
use_external_inp_buf=use_external_inp_buf,
58+
quant_type=quant_type,
5059
)
5160

5261
def setup(self):
@@ -177,10 +186,22 @@ def gen_test_data(self):
177186
generator=self.rng,
178187
device=self.device,
179188
)
189+
if _is_fp4x2_dtype(self.config.data_type):
190+
input_bytes = torch.randint(
191+
0,
192+
256,
193+
(num_tokens, self.config.hidden_dim),
194+
dtype=torch.uint8,
195+
generator=self.rng,
196+
device=self.device,
197+
)
198+
input = input_bytes.view(torch.float4_e2m1fn_x2)
199+
else:
200+
input = input_fp32.to(self.config.data_type)
201+
180202
input_list = self._allgather_with_token_num_padding(
181-
input_fp32, self.config.max_num_inp_token_per_rank
203+
input, self.config.max_num_inp_token_per_rank
182204
)
183-
input_list = [tensor.to(self.config.data_type) for tensor in input_list]
184205

185206
return (
186207
num_tokens,
@@ -189,7 +210,7 @@ def gen_test_data(self):
189210
# None,
190211
# scales_fp32,
191212
scales_fp32.to(torch.float8_e4m3fnuz),
192-
input_fp32.to(self.config.data_type),
213+
input,
193214
indices_list,
194215
weights_list,
195216
# None,
@@ -233,7 +254,13 @@ def run_test_once(self, op, test_data):
233254
for i, pos in enumerate(src_token_pos):
234255
src_rank = int(pos) // self.config.max_num_inp_token_per_rank
235256
src_id = int(pos) % self.config.max_num_inp_token_per_rank
236-
assert torch.equal(input_list[src_rank][src_id], dispatch_output[i])
257+
if _is_fp4x2_dtype(self.config.data_type):
258+
assert torch.equal(
259+
input_list[src_rank][src_id].view(torch.uint8),
260+
dispatch_output[i].view(torch.uint8),
261+
)
262+
else:
263+
assert torch.equal(input_list[src_rank][src_id], dispatch_output[i])
237264
assert torch.equal(weights_list[src_rank][src_id], dispatch_weights[i])
238265
if scales_list is not None and self.config.scale_dim != 0:
239266
assert torch.equal(scales_list[src_rank][src_id], dispatch_scales[i])
@@ -263,6 +290,8 @@ def run_test_once(self, op, test_data):
263290
torch.cuda.synchronize()
264291

265292
for i in range(num_tokens):
293+
# if _is_fp4x2_dtype(self.config.data_type):
294+
# continue
266295
pes = [
267296
(idx // self.config.num_experts_per_rank)
268297
for idx in indices[i].cpu().tolist()
@@ -274,7 +303,10 @@ def run_test_once(self, op, test_data):
274303
# ).to(self.config.data_type)
275304
got, expected = combine_output[i], input[i].to(torch.bfloat16) * unique_pes
276305

277-
assert torch.allclose(got.float(), expected.float(), atol=1e-2, rtol=1e-2)
306+
atol, rtol = 1e-2, 1e-2
307+
if self.config.quant_type == "fp8_direct_cast":
308+
atol, rtol = 1e-1, 1e-1
309+
assert torch.allclose(got.float(), expected.float(), atol=atol, rtol=rtol)
278310

279311
got_weight, expected_weight = (
280312
combine_output_weight[i],
@@ -309,16 +341,45 @@ def test_dispatch_combine(self):
309341
del op
310342

311343

312-
def test_dispatch_combine(rank, world_size):
344+
def test_dispatch_combine(rank, world_size, dtype, quant_type="none"):
313345
# test_case = EpDispatchCombineTestCase(rank, world_size, torch.float8_e4m3fnuz)
314-
test_case = EpDispatchCombineTestCase(rank, world_size, torch.bfloat16)
346+
test_case = EpDispatchCombineTestCase(rank, world_size, dtype, quant_type)
315347
test_case.setup()
316348
test_case.test_dispatch_combine()
317349
test_case.cleanup()
318350

319351

320352
if __name__ == "__main__":
353+
import argparse
354+
355+
parser = argparse.ArgumentParser()
356+
parser.add_argument(
357+
"--dtype",
358+
type=str,
359+
default="bf16",
360+
choices=["bf16", "fp4"],
361+
help="Data type of dispatch / combine",
362+
)
363+
parser.add_argument(
364+
"--quant-type",
365+
type=str,
366+
default="none",
367+
choices=["none", "fp8_direct_cast"],
368+
help="Quantization method used inside Combine.",
369+
)
370+
args = parser.parse_args()
371+
372+
_DATA_TYPE_MAP = {
373+
"bf16": torch.bfloat16,
374+
"fp4": torch.float4_e2m1fn_x2,
375+
}
376+
if args.quant_type == "fp8_direct_cast" and _DATA_TYPE_MAP[args.dtype] is torch.float4_e2m1fn_x2:
377+
raise ValueError("fp8_direct_cast is not supported for fp4 data type")
378+
321379
world_size = 8
322380
torch.multiprocessing.spawn(
323-
test_dispatch_combine, args=(world_size,), nprocs=world_size, join=True
381+
test_dispatch_combine,
382+
args=(world_size, _DATA_TYPE_MAP[args.dtype], args.quant_type),
383+
nprocs=world_size,
384+
join=True,
324385
)

include/mori/application/transport/rdma/rdma.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -190,6 +190,12 @@ class RdmaDevice;
190190

191191
std::optional<uint8_t> ReadRdmaServiceLevelEnv();
192192
std::optional<uint8_t> ReadRdmaTrafficClassEnv();
193+
std::optional<uint8_t> ReadIoServiceLevelEnv();
194+
std::optional<uint8_t> ReadIoTrafficClassEnv();
195+
bool ReadIoTrafficClassDisableEnv();
196+
197+
bool ReadIbEnableRelaxedOrderingEnv();
198+
int MaybeAddRelaxedOrderingFlag(int accessFlag);
193199

194200
/* -------------------------------------------------------------------------- */
195201
/* RdmaDeviceContext */

include/mori/core/transport/p2p/device_primitives.hpp

Lines changed: 52 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -796,11 +796,11 @@ __forceinline__ __device__ void WarpCastBf16ToCombineInternalFp8(
796796
}
797797
}
798798
}
799+
// Note: when T != hip_bfloat16, this function is a no-op.
800+
// Callers should guard with if constexpr or ensure T is hip_bfloat16.
799801
#else
800-
(void)dst;
801-
(void)src;
802-
(void)hiddenDim;
803-
(void)laneId;
802+
static_assert(!sizeof(T*), "WarpCastBf16ToCombineInternalFp8 requires FP8 type support "
803+
"(MORI_FP8_TYPE_OCP_ENABLED or MORI_FP8_TYPE_FNUZ_ENABLED)");
804804
#endif
805805
}
806806

@@ -809,31 +809,28 @@ namespace detail {
809809
using CombineInternalFp8T = CombineInternalFp8;
810810
using CombineInternalFp8x4T = CombineInternalFp8x4;
811811

812-
template <int NNodes>
813-
__forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16Fixed(
814-
hip_bfloat16* __restrict__ out, const CombineInternalFp8T* const* __restrict__ srcPtrs,
815-
int laneId, int hiddenDimSize);
816-
817-
template <>
818-
__forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16Fixed<2>(
812+
template <int AccumNum>
813+
__forceinline__ __device__ void WarpAccumCombineInternalFp8ToBf16Fixed(
819814
hip_bfloat16* __restrict__ out, const CombineInternalFp8T* const* __restrict__ srcPtrs,
820815
int laneId, int hiddenDimSize) {
816+
static_assert(AccumNum > 0, "AccumNum must be positive");
817+
821818
using Fp8T = CombineInternalFp8T;
822819
using Fp8x4T = CombineInternalFp8x4T;
823820
constexpr int kVec8 = 8;
824821
constexpr int kVec4 = 4;
825822

826-
const Fp8T* src0 = srcPtrs[0];
827-
const Fp8T* src1 = srcPtrs[1];
828-
829823
const uintptr_t outAddr = reinterpret_cast<uintptr_t>(out);
830-
const uintptr_t src0Addr = reinterpret_cast<uintptr_t>(src0);
831-
const uintptr_t src1Addr = reinterpret_cast<uintptr_t>(src1);
832-
833-
const bool canVec8 = ((outAddr & 0x7) == 0) && ((src0 == nullptr) || ((src0Addr & 0x7) == 0)) &&
834-
((src1 == nullptr) || ((src1Addr & 0x7) == 0));
835-
const bool canVec4 = ((src0 == nullptr) || ((src0Addr & 0x3) == 0)) &&
836-
((src1 == nullptr) || ((src1Addr & 0x3) == 0));
824+
bool canVec8 = ((outAddr & 0x7) == 0);
825+
bool canVec4 = true;
826+
#pragma unroll
827+
for (int n = 0; n < AccumNum; n++) {
828+
const Fp8T* src = srcPtrs[n];
829+
if (src == nullptr) continue;
830+
const uintptr_t srcAddr = reinterpret_cast<uintptr_t>(src);
831+
canVec8 &= ((srcAddr & 0x7) == 0);
832+
canVec4 &= ((srcAddr & 0x3) == 0);
833+
}
837834

838835
const int vecEnd8 = (hiddenDimSize / kVec8) * kVec8;
839836
const int vecEnd4 = (hiddenDimSize / kVec4) * kVec4;
@@ -846,7 +843,7 @@ __forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16Fixed<2>(
846843
float4 sumLo = {0.0f, 0.0f, 0.0f, 0.0f};
847844
float4 sumHi = {0.0f, 0.0f, 0.0f, 0.0f};
848845
#pragma unroll
849-
for (int n = 0; n < 2; n++) {
846+
for (int n = 0; n < AccumNum; n++) {
850847
const Fp8T* src = srcPtrs[n];
851848
if (src == nullptr) continue;
852849
const auto* srcAligned = static_cast<const Fp8T*>(__builtin_assume_aligned(src, 8));
@@ -892,7 +889,7 @@ __forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16Fixed<2>(
892889
for (int j = vecEnd8 + laneId * kVec4; j < vecEnd4; j += warpSize * kVec4) {
893890
float4 sum4 = {0.0f, 0.0f, 0.0f, 0.0f};
894891
#pragma unroll
895-
for (int n = 0; n < 2; n++) {
892+
for (int n = 0; n < AccumNum; n++) {
896893
const Fp8T* src = srcPtrs[n];
897894
if (src == nullptr) continue;
898895
Fp8x4T v;
@@ -914,7 +911,7 @@ __forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16Fixed<2>(
914911
for (int j = laneId * kVec4; j < vecEnd4; j += warpSize * kVec4) {
915912
float4 sum4 = {0.0f, 0.0f, 0.0f, 0.0f};
916913
#pragma unroll
917-
for (int n = 0; n < 2; n++) {
914+
for (int n = 0; n < AccumNum; n++) {
918915
const Fp8T* src = srcPtrs[n];
919916
if (src == nullptr) continue;
920917
Fp8x4T v;
@@ -936,7 +933,7 @@ __forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16Fixed<2>(
936933
for (int j = scalarStart + laneId; j < hiddenDimSize; j += warpSize) {
937934
float sum = 0.0f;
938935
#pragma unroll
939-
for (int n = 0; n < 2; n++) {
936+
for (int n = 0; n < AccumNum; n++) {
940937
const Fp8T* src = srcPtrs[n];
941938
if (src == nullptr) continue;
942939
sum += float(src[j]);
@@ -945,9 +942,9 @@ __forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16Fixed<2>(
945942
}
946943
}
947944

948-
__forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16Dynamic(
945+
__forceinline__ __device__ void WarpAccumCombineInternalFp8ToBf16Dynamic(
949946
hip_bfloat16* __restrict__ out, const CombineInternalFp8T* const* __restrict__ srcPtrs,
950-
int nNodes, int laneId, int hiddenDimSize) {
947+
int accumNum, int laneId, int hiddenDimSize) {
951948
using Fp8T = CombineInternalFp8T;
952949
using Fp8x4T = CombineInternalFp8x4T;
953950

@@ -956,7 +953,7 @@ __forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16Dynamic(
956953

957954
bool canVec4 = true;
958955
#pragma unroll 4
959-
for (int n = 0; n < nNodes; n++) {
956+
for (int n = 0; n < accumNum; n++) {
960957
const Fp8T* src = srcPtrs[n];
961958
if (src == nullptr) continue;
962959
canVec4 &= ((reinterpret_cast<uintptr_t>(src) & 0x3) == 0);
@@ -966,7 +963,7 @@ __forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16Dynamic(
966963
for (int j = laneId * kVec4; j < vecEnd; j += warpSize * kVec4) {
967964
float4 sum4 = {0.0f, 0.0f, 0.0f, 0.0f};
968965
#pragma unroll 4
969-
for (int n = 0; n < nNodes; n++) {
966+
for (int n = 0; n < accumNum; n++) {
970967
const Fp8T* src = srcPtrs[n];
971968
if (src == nullptr) continue;
972969
Fp8x4T v;
@@ -988,7 +985,7 @@ __forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16Dynamic(
988985
for (int j = scalarStart + laneId; j < hiddenDimSize; j += warpSize) {
989986
float sum = 0.0f;
990987
#pragma unroll 4
991-
for (int n = 0; n < nNodes; n++) {
988+
for (int n = 0; n < accumNum; n++) {
992989
const Fp8T* src = srcPtrs[n];
993990
if (src == nullptr) continue;
994991
sum += float(src[j]);
@@ -1001,29 +998,37 @@ __forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16Dynamic(
1001998
#endif
1002999

10031000
template <typename T>
1004-
__forceinline__ __device__ void SumCombineInternalFp8AcrossNodesToBf16(
1005-
T* __restrict__ out, const CombineInternalFp8* const* __restrict__ srcPtrs, int nNodes,
1001+
__forceinline__ __device__ void WarpAccumCombineInternalFp8ToBf16(
1002+
T* __restrict__ out, const CombineInternalFp8* const* __restrict__ srcPtrs, int accumNum,
10061003
int laneId, int hiddenDimSize) {
10071004
#if defined(MORI_FP8_TYPE_OCP_ENABLED) || defined(MORI_FP8_TYPE_FNUZ_ENABLED)
10081005
if constexpr (std::is_same_v<T, hip_bfloat16>) {
1009-
if (nNodes == 2) {
1010-
detail::SumCombineInternalFp8AcrossNodesToBf16Fixed<2>(
1011-
reinterpret_cast<hip_bfloat16*>(out),
1012-
reinterpret_cast<const detail::CombineInternalFp8T* const*>(srcPtrs), laneId,
1013-
hiddenDimSize);
1014-
} else {
1015-
detail::SumCombineInternalFp8AcrossNodesToBf16Dynamic(
1016-
reinterpret_cast<hip_bfloat16*>(out),
1017-
reinterpret_cast<const detail::CombineInternalFp8T* const*>(srcPtrs), nNodes, laneId,
1018-
hiddenDimSize);
1006+
switch (accumNum) {
1007+
case 2:
1008+
detail::WarpAccumCombineInternalFp8ToBf16Fixed<2>(
1009+
reinterpret_cast<hip_bfloat16*>(out),
1010+
reinterpret_cast<const detail::CombineInternalFp8T* const*>(srcPtrs), laneId,
1011+
hiddenDimSize);
1012+
break;
1013+
case 8:
1014+
detail::WarpAccumCombineInternalFp8ToBf16Fixed<8>(
1015+
reinterpret_cast<hip_bfloat16*>(out),
1016+
reinterpret_cast<const detail::CombineInternalFp8T* const*>(srcPtrs), laneId,
1017+
hiddenDimSize);
1018+
break;
1019+
default:
1020+
detail::WarpAccumCombineInternalFp8ToBf16Dynamic(
1021+
reinterpret_cast<hip_bfloat16*>(out),
1022+
reinterpret_cast<const detail::CombineInternalFp8T* const*>(srcPtrs), accumNum, laneId,
1023+
hiddenDimSize);
1024+
break;
10191025
}
10201026
}
1027+
// Note: when T != hip_bfloat16, this function is a no-op.
1028+
// Callers should guard with if constexpr or ensure T is hip_bfloat16.
10211029
#else
1022-
(void)out;
1023-
(void)srcPtrs;
1024-
(void)nNodes;
1025-
(void)laneId;
1026-
(void)hiddenDimSize;
1030+
static_assert(!sizeof(T*), "WarpAccumCombineInternalFp8ToBf16 requires FP8 type support "
1031+
"(MORI_FP8_TYPE_OCP_ENABLED or MORI_FP8_TYPE_FNUZ_ENABLED)");
10271032
#endif
10281033
}
10291034

src/application/transport/rdma/providers/bnxt/bnxt.cpp

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -286,9 +286,10 @@ BnxtQpContainer::BnxtQpContainer(ibv_context* context, const RdmaEndpointConfig&
286286
}
287287

288288
// Register atomic ibuf as independent memory region
289-
atomicIbufMr = ibv_reg_mr(pd, atomicIbufAddr, atomicIbufSize,
290-
IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
291-
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC);
289+
int atomicIbufAccessFlag =
290+
MaybeAddRelaxedOrderingFlag(IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_REMOTE_WRITE |
291+
IBV_ACCESS_REMOTE_READ | IBV_ACCESS_REMOTE_ATOMIC);
292+
atomicIbufMr = ibv_reg_mr(pd, atomicIbufAddr, atomicIbufSize, atomicIbufAccessFlag);
292293
assert(atomicIbufMr);
293294

294295
MORI_APP_TRACE(

src/application/transport/rdma/providers/ibverbs/ibverbs.cpp

Lines changed: 15 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -133,10 +133,21 @@ void IBVerbsDeviceContext::ConnectEndpoint(const RdmaEndpointHandle& local,
133133
attr.min_rnr_timer = 12;
134134
attr.ah_attr.src_path_bits = 0;
135135
attr.ah_attr.port_num = local.portId;
136-
attr.ah_attr.sl = ReadRdmaServiceLevelEnv().value_or(0);
137-
std::optional<uint8_t> tc = ReadRdmaTrafficClassEnv();
138-
if (tc.has_value()) {
139-
attr.ah_attr.grh.traffic_class = tc.value();
136+
std::optional<uint8_t> sl = ReadIoServiceLevelEnv();
137+
if (!sl.has_value()) {
138+
sl = ReadRdmaServiceLevelEnv();
139+
}
140+
attr.ah_attr.sl = sl.value_or(0);
141+
142+
bool disableIoTc = ReadIoTrafficClassDisableEnv();
143+
if (!disableIoTc) {
144+
std::optional<uint8_t> tc = ReadIoTrafficClassEnv();
145+
if (!tc.has_value()) {
146+
tc = ReadRdmaTrafficClassEnv();
147+
}
148+
if (tc.has_value()) {
149+
attr.ah_attr.grh.traffic_class = tc.value();
150+
}
140151
}
141152
MORI_APP_INFO("ibverbs attr.ah_attr.sl:{} attr.ah_attr.grh.traffic_class:{}", attr.ah_attr.sl,
142153
attr.ah_attr.grh.traffic_class);

0 commit comments

Comments (0)