Skip to content

Commit e11679e

Browse files
mustafabarsystems-assistant[bot]
authored andcommitted
[RCCL] Fix netOverride being skipped when rail trees are
enabled (restore desired NIC mapping for targeted 4-NIC systems) (#4726) ## Motivation This PR resolves a performance issue where enabling rail-optimized trees caused an early return, skipping the netOverride logic on affected systems. On some 4-NIC MI3xx systems, this manifested as: - enabling rail trees appearing to regress ring performance - inconsistent behavior depending on whether rail trees were selected `netOverride` is required for pairing GPUs with no PXB path to NICs on these systems. ## Technical Details 1. Remove early return after treeRail success 2. Refactor override into helper `applyNetOverride(system, romeTopoModels[i].options);` 3. Apply override after graph construction 4. Now runs regardless of what ring/tree are matched 5. Add runtime control `RCCL_DISABLE_NET_OVERRIDE` for debugging issues with netOverride due to hard assumptions about system layout ## JIRA ID ROCM-3200 ## Test Plan <!-- Explain any relevant testing done to verify this PR. --> ## Test Result <!-- Briefly summarize test outcomes. --> ## Submission Checklist - [ ] Look over the contributing guidelines at https://github.com/ROCm/ROCm/blob/develop/CONTRIBUTING.md#pull-requests. [rocm-systems] ROCm/rocm-systems#4726 (commit f66e1c0)
1 parent 22cac7b commit e11679e

File tree

1 file changed

+42
-42
lines changed

1 file changed

+42
-42
lines changed

src/graph/rome_models.cc

Lines changed: 42 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -2457,6 +2457,35 @@ int checkAlltoallWidth(struct rcclRomeModel *romeTopo) {
24572457
}
24582458

24592459
RCCL_PARAM(DisableRailTrees, "DISABLE_RAIL_TREES", 0);
2460+
RCCL_PARAM(DisableNetOverride, "DISABLE_NET_OVERRIDE", 0);
2461+
2462+
static void applyNetOverride(struct ncclTopoSystem* system, const char* options) {
2463+
if (rcclParamDisableNetOverride()) return;
2464+
if (!checkOption(options, "netOverride")) return;
2465+
// This override assumes a fixed layout: each NIC hangs off a PCIe switch that
2466+
// also connects a single GPU, and the non-paired GPUs (usually >= PATH_PHB)
2467+
// are paired by gpu.dev/2. On systems with different switch fan-out, mixed
2468+
// placement, or numbering, this can mis-classify paths. Use
2469+
// RCCL_DISABLE_NET_OVERRIDE to opt out.
2470+
// Kernel patch https://lkml.org/lkml/2024/6/12/551 is recommended to avoid having to
2471+
// pair GPUs with NICs (check xml.cc:getBcmLinks)
2472+
// Applying kernel patch should pick up rome_model_88 in the 8-GPU/4-NIC that does not
2473+
// apply this override, but we keep this override for other systems that may benefit from it.
2474+
for (int i = 0; i < system->nodes[NET].count; i++) {
2475+
for (int j = 0; j < system->nodes[GPU].count; j++) {
2476+
if (system->nodes[GPU].nodes[j].paths[NET][i].type == PATH_PXB) {
2477+
int k;
2478+
for (k = 0; k < system->nodes[GPU].count; k++) {
2479+
if (k != j &&
2480+
system->nodes[GPU].nodes[k].gpu.dev/2 == system->nodes[GPU].nodes[j].gpu.dev/2)
2481+
break;
2482+
}
2483+
if (k < system->nodes[GPU].count)
2484+
system->nodes[GPU].nodes[k].paths[NET][i].type = PATH_PXB;
2485+
}
2486+
}
2487+
}
2488+
}
24602489

24612490
ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* graph, const char *ringBase) {
24622491
constexpr int NUMA_CPUS = 2;
@@ -2623,35 +2652,21 @@ ncclResult_t parseA2a8P(struct ncclTopoSystem* system, struct ncclTopoGraph* gra
26232652
NCCLCHECK(parseGraph(romeTopoModels[i].treeRail, system, graph, g8, nnets > 1 ? n : NULL, 0));
26242653
if (graph->nChannels) {
26252654
system->useRailOptimizedTrees = true;
2626-
return ncclSuccess;
26272655
}
26282656
}
26292657

2630-
// Fall back to looking for tree configuration from treeBase
2631-
if (romeTopoModels[i].treeBase != nullptr) {
2632-
NCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, rdm.data()));
2633-
if (graph->nChannels) return ncclSuccess;
2658+
if (!graph->nChannels) {
2659+
// Fall back to looking for tree configuration from treeBase
2660+
if (romeTopoModels[i].treeBase != nullptr) {
2661+
NCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, rdm.data()));
2662+
}
26342663
}
26352664

2636-
// Fall back to tree from ringBase
2637-
NCCLCHECK(parseGraph(ringBase != nullptr ? ringBase : romeTopoModels[i].ringBase, system, graph, rdm.data(), nnets > 1 ? n : NULL, 0));
2638-
// Override GDR distance if requested
2639-
if (checkOption(romeTopoModels[i].options, "netOverride")) {
2640-
for (int i = 0; i < system->nodes[NET].count; i++) {
2641-
for (int j = 0; j < system->nodes[GPU].count; j++) {
2642-
if (system->nodes[GPU].nodes[j].paths[NET][i].type == PATH_PXB) {
2643-
int k;
2644-
for (k = 0; k < system->nodes[GPU].count; k++) {
2645-
if (k != j &&
2646-
system->nodes[GPU].nodes[k].gpu.dev/2 == system->nodes[GPU].nodes[j].gpu.dev/2)
2647-
break;
2648-
}
2649-
if (k < system->nodes[GPU].count)
2650-
system->nodes[GPU].nodes[k].paths[NET][i].type = PATH_PXB;
2651-
}
2652-
}
2653-
}
2665+
if (!graph->nChannels) {
2666+
// Fall back to tree from ringBase
2667+
NCCLCHECK(parseGraph(ringBase != nullptr ? ringBase : romeTopoModels[i].ringBase, system, graph, rdm.data(), nnets > 1 ? n : NULL, 0));
26542668
}
2669+
applyNetOverride(system, romeTopoModels[i].options);
26552670
break;
26562671
}
26572672

@@ -2814,28 +2829,13 @@ ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, struct ncclTopoGraph*
28142829
case NCCL_TOPO_PATTERN_BALANCED_TREE:
28152830
if (romeTopoModels[i].treeBase != nullptr) {
28162831
NCCLCHECK(parseGraphLight(romeTopoModels[i].treeBase, system, graph, g));
2817-
if (graph->nChannels) return ncclSuccess;
28182832
}
28192833

2820-
// Fall back to tree from ringBase
2821-
NCCLCHECK(parseGraph(ringBase != nullptr ? ringBase : romeTopoModels[i].ringBase, system, graph, rdm, nnets > 1 ? n : NULL, 0));
2822-
// Override GDR distance if requested
2823-
if (checkOption(romeTopoModels[i].options, "netOverride")) {
2824-
for (int i = 0; i < system->nodes[NET].count; i++) {
2825-
for (int j = 0; j < system->nodes[GPU].count; j++) {
2826-
if (system->nodes[GPU].nodes[j].paths[NET][i].type == PATH_PXB) {
2827-
int k;
2828-
for (k = 0; k < system->nodes[GPU].count; k++) {
2829-
if (k != j &&
2830-
system->nodes[GPU].nodes[k].gpu.dev/2 == system->nodes[GPU].nodes[j].gpu.dev/2)
2831-
break;
2832-
}
2833-
if (k < system->nodes[GPU].count)
2834-
system->nodes[GPU].nodes[k].paths[NET][i].type = PATH_PXB;
2835-
}
2836-
}
2837-
}
2834+
if (!graph->nChannels) {
2835+
// Fall back to tree from ringBase
2836+
NCCLCHECK(parseGraph(ringBase != nullptr ? ringBase : romeTopoModels[i].ringBase, system, graph, rdm, nnets > 1 ? n : NULL, 0));
28382837
}
2838+
applyNetOverride(system, romeTopoModels[i].options);
28392839
break;
28402840
}
28412841
return ncclSuccess;

0 commit comments

Comments
 (0)