Skip to content

Commit b68a984

Browse files
committed
Merge branch 'main' into epv1_fp8_noquant
2 parents 1856e0e + 811ba01 commit b68a984

File tree

23 files changed

+2270
-356
lines changed

23 files changed

+2270
-356
lines changed

.github/workflows/ci.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ jobs:
66
build:
77
name: mori build
88
runs-on: label-1
9+
timeout-minutes: 10
910
steps:
1011
- name: clone source code
1112
run: |
@@ -27,14 +28,19 @@ jobs:
2728
#pip install --target /apps/mori-ci/packages -r requirements-build.txt
2829
pip install -r requirements-build.txt
2930
#git submodule update --init --recursive
31+
#By default, it indicates the use of a Mellanox NIC; it has been verified on MI300X/MI355X.
3032
#pip3 install . --no-build-isolation
33+
#The string "BNXT" indicates the use of a Thor 2 NIC; it has been verified on MI300X/MI355X.
34+
#USE_BNXT=ON pip3 install . --no-build-isolation
35+
#The string "IONIC" indicates the use of an AINIC; it has been verified on MI300X/MI355X.
3136
USE_IONIC=ON pip3 install . --no-build-isolation
3237
# --target /apps/mori-ci/python-packages
3338
scp -P 2233 -r /root/actions-runner/_work/mori/mori/mori root@smci355-ccs-aus-n08-33:/root/actions-runner/_work/mori/mori/
3439
test:
3540
name: mori test
3641
needs: build
3742
runs-on: label-1
43+
timeout-minutes: 10
3844
steps:
3945
#- uses: actions/checkout@v4
4046
#- name: Set up Python

CMakeLists.txt

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@ option(BUILD_PYBINDS "Whether to build mori python bindings" ON)
3131
option(BUILD_TESTS "Whether to build mori CPP tests" ON)
3232
option(ENABLE_PROFILER "Enable kernel profiling" OFF)
3333
option(ENABLE_DEBUG_PRINTF "Enable debug printf in device kernels" OFF)
34+
option(ENABLE_STANDARD_MOE_ADAPT "Enable standard moe adapt" OFF)
3435

3536
set(CMAKE_LIBRARY_PATH "/usr/local/lib;${CMAKE_LIBRARY_PATH}")
3637
find_library(BNXT_RE_LIB NAMES bnxt_re)
@@ -76,6 +77,11 @@ if(ENABLE_DEBUG_PRINTF)
7677
add_compile_definitions(MORI_ENABLE_DEBUG_PRINTF)
7778
endif()
7879

80+
message(STATUS "ENABLE_STANDARD_MOE_ADAPT = ${ENABLE_STANDARD_MOE_ADAPT}")
81+
if(ENABLE_STANDARD_MOE_ADAPT)
82+
add_compile_definitions(ENABLE_STANDARD_MOE_ADAPT)
83+
endif()
84+
7985
if(NOT DEFINED WARP_ACCUM_UNROLL)
8086
set(WARP_ACCUM_UNROLL 1)
8187
endif()

README.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,11 @@
44

55
**MORI** (**Mo**dular **R**DMA **I**nterface) is a **bottom-up, modular, and composable framework** for building high-performance communication applications with a strong focus on **RDMA + GPU integration**. Inspired by the role of MLIR in compiler infrastructure, MORI provides reusable and extensible building blocks that make it **easier for developers to adopt advanced techniques** such as IBGDA (Infiniband GPUDirect Async) and GDS (GPUDirect Storage).
66

7-
To help developers get started quickly, MORI also includes a suite of optimized libraries—**MORI-EP** (MoE dispatch & combine kernels), **MORI-IO** (p2p communication for KVCache transfer), and **MORI-CCL** (collective communication)—that deliver out-of-the-box performance.
7+
To help developers get started quickly, MORI also includes a suite of optimized libraries—**MORI-EP** (MoE dispatch & combine kernels), **MORI-IO** (p2p communication for KVCache transfer), and **MORI-CCL** (collective communication)—that deliver out-of-the-box performance, with support for AMD `Pensando DSC`, Broadcom `Thor2`, and NVIDIA Mellanox `ConnectX-7` NICs.
88

99
Feature summary:
1010
- Applications
11-
- MORI-EP: intra and inter-node dispatch/combine kernels with SOTA performance
11+
- MORI-EP: intra and inter-node dispatch/combine kernels with SOTA performance.
1212
- MORI-IO: point-to-point communication library with ultra-low overhead
1313
- MORI-CCL: lightweight and flexible collective communication library designed for highly customized use cases such as latency-sensitive or resource-constrained environments
1414
- Framework

examples/dist_rdma_ops/dist_write.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -431,7 +431,7 @@ int GetGpuidByNicName(std::string nic_name) {
431431
auto* gpuPci = pciSys->Node(gpu->busId);
432432
auto* nicPci = pciSys->Node(nic->busId);
433433
#if 0
434-
if (!path) {
434+
if (!path) {
435435
printf("gpu %s nic %s no direct link\n", gpu->busId.String().c_str(),
436436
nic->busId.String().c_str());
437437
} else {

examples/ops/dispatch_combine/test_dispatch_combine.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -403,10 +403,10 @@ class EpDispatchCombineTestCase {
403403
InitializeHandle();
404404
SystemBarrier();
405405

406-
handle.LaunchDispatch(runConfig.kernelType, -1, -1, stream);
406+
handle.LaunchDispatch(runConfig.kernelType, -1, -1, -1, stream);
407407
CopyDispatchOutAsCombineInp();
408408
SystemBarrier();
409-
handle.LaunchCombine(runConfig.kernelType, -1, -1, stream);
409+
handle.LaunchCombine(runConfig.kernelType, -1, -1, -1, -1, stream);
410410
if (handle.config.rank == 0) std::cout << "Warmup Done" << std::endl;
411411
}
412412

@@ -420,7 +420,7 @@ class EpDispatchCombineTestCase {
420420
SystemBarrier();
421421

422422
HIP_RUNTIME_CHECK(hipEventRecord(events[0]));
423-
handle.LaunchDispatch(runConfig.kernelType, -1, -1, stream);
423+
handle.LaunchDispatch(runConfig.kernelType, -1, -1, -1, stream);
424424
HIP_RUNTIME_CHECK(hipEventRecord(events[1]));
425425

426426
HIP_RUNTIME_CHECK(hipMemcpy(&dispatchTotalRecvTokenNum, handle.totalRecvTokenNum, sizeof(int),
@@ -429,7 +429,7 @@ class EpDispatchCombineTestCase {
429429
SystemBarrier();
430430

431431
HIP_RUNTIME_CHECK(hipEventRecord(events[2]));
432-
handle.LaunchCombine(runConfig.kernelType, -1, -1, stream);
432+
handle.LaunchCombine(runConfig.kernelType, -1, -1, -1, -1, stream);
433433
HIP_RUNTIME_CHECK(hipEventRecord(events[3]));
434434

435435
float dispatch, combine;

0 commit comments

Comments
 (0)