From 47d7345b58ab4a17dc05b92af8e764b5650aca61 Mon Sep 17 00:00:00 2001 From: Raul Akhmetshin Date: Wed, 15 Apr 2026 21:21:44 +0300 Subject: [PATCH 1/2] UCT/IB/MLX5/GDAKI: Added knob to control retaining inactive ctx. --- src/uct/ib/base/ib_md.c | 6 +++ src/uct/ib/base/ib_md.h | 2 + src/uct/ib/mlx5/gdaki/gdaki.c | 81 ++++++++++++++++++++--------------- 3 files changed, 55 insertions(+), 34 deletions(-) diff --git a/src/uct/ib/base/ib_md.c b/src/uct/ib/base/ib_md.c index 793bf723457..405dffb60e2 100644 --- a/src/uct/ib/base/ib_md.c +++ b/src/uct/ib/base/ib_md.c @@ -129,6 +129,12 @@ ucs_config_field_t uct_ib_md_config_table[] = { "Enable DMA-BUF in GDA.", ucs_offsetof(uct_ib_md_config_t, ext.gda_dmabuf_enable), UCS_CONFIG_TYPE_TERNARY}, + {"GDA_RETAIN_INACTIVE_CTX", "n", + "Retain and use an inactive CUDA primary context to query device " + "capabilities.", + ucs_offsetof(uct_ib_md_config_t, ext.gda_retain_inactive_ctx), + UCS_CONFIG_TYPE_BOOL}, + {"PCI_BW", "", "Maximum effective data transfer rate of PCI bus connected to HCA\n", ucs_offsetof(uct_ib_md_config_t, pci_bw), UCS_CONFIG_TYPE_ARRAY(pci_bw)}, diff --git a/src/uct/ib/base/ib_md.h b/src/uct/ib/base/ib_md.h index b7ea5d0eb66..40c37a57ea3 100644 --- a/src/uct/ib/base/ib_md.h +++ b/src/uct/ib/base/ib_md.h @@ -114,6 +114,8 @@ typedef struct uct_ib_md_ext_config { int direct_nic; /**< Direct NIC with GPU functionality */ unsigned gda_max_hca_per_gpu; /**< Threshold of IB per GPU */ int gda_dmabuf_enable; /**< Enable DMA-BUF in GDA */ + /**< Retain and use an inactive CUDA primary context to query device capabilities */ + int gda_retain_inactive_ctx; } uct_ib_md_ext_config_t; diff --git a/src/uct/ib/mlx5/gdaki/gdaki.c b/src/uct/ib/mlx5/gdaki/gdaki.c index bfe5003416c..e0d05d5afa0 100644 --- a/src/uct/ib/mlx5/gdaki/gdaki.c +++ b/src/uct/ib/mlx5/gdaki/gdaki.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "gpunetio/common/doca_gpunetio_verbs_def.h" @@ -124,6 +125,33 @@ static void uct_rc_gdaki_calc_dev_ep_layout(size_t num_channels, size_t wq_len, *pgsz_bitmap_p = (max_page_size << 1) - 1; } +static CUdevice uct_gdaki_push_primary_ctx(int retain_inactive_ctx) +{ + CUdevice cuda_dev; + ucs_status_t status; + + status = uct_cuda_ctx_primary_push_first_active(&cuda_dev); + if (status == UCS_OK) { + return cuda_dev; + } + + if ((status != UCS_ERR_NO_DEVICE) || !retain_inactive_ctx) { + return CU_DEVICE_INVALID; + } + + status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&cuda_dev, 0)); + if (status != UCS_OK) { + return CU_DEVICE_INVALID; + } + + status = uct_cuda_ctx_primary_push(cuda_dev, 1, UCS_LOG_LEVEL_ERROR); + if (status != UCS_OK) { + return CU_DEVICE_INVALID; + } + + return cuda_dev; +} + static int uct_gdaki_check_umem_dmabuf(const uct_ib_md_t *md) { ucs_status_t status = UCS_ERR_UNSUPPORTED; @@ -132,21 +160,16 @@ static int uct_gdaki_check_umem_dmabuf(const uct_ib_md_t *md) struct mlx5dv_devx_umem *umem; uct_cuda_copy_md_dmabuf_t dmabuf; CUdeviceptr buff; - CUcontext cuda_ctx; + CUdevice cuda_dev; - status = UCT_CUDADRV_FUNC_LOG_ERR(cuDevicePrimaryCtxRetain(&cuda_ctx, 0)); - if (status != UCS_OK) { + cuda_dev = uct_gdaki_push_primary_ctx(md->config.gda_retain_inactive_ctx); + if (cuda_dev == CU_DEVICE_INVALID) { return 0; } - status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(cuda_ctx)); - if (status != UCS_OK) { - goto out_ctx_release; - } - status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemAlloc(&buff, 1)); if (status != UCS_OK) { - goto out_ctx_pop; + goto out_ctx_pop_and_release; } dmabuf = uct_cuda_copy_md_get_dmabuf((void*)buff, 1, @@ -169,10 +192,8 @@ static int uct_gdaki_check_umem_dmabuf(const uct_ib_md_t *md) out_free: ucs_close_fd(&dmabuf.fd); cuMemFree(buff); -out_ctx_pop: - UCT_CUDADRV_FUNC_LOG_WARN(cuCtxPopCurrent(NULL)); -out_ctx_release: - UCT_CUDADRV_FUNC_LOG_WARN(cuDevicePrimaryCtxRelease(0)); +out_ctx_pop_and_release: + uct_cuda_ctx_primary_pop_and_release(cuda_dev); #endif return status == UCS_OK; @@ -1134,12 +1155,11 @@ static UCS_CLASS_DEFINE_NEW_FUNC(uct_rc_gdaki_iface_t, uct_iface_t, uct_md_h, static UCS_CLASS_DEFINE_DELETE_FUNC(uct_rc_gdaki_iface_t, uct_iface_t); -static ucs_status_t -uct_gdaki_md_check_uar(uct_ib_mlx5_md_t *md, CUdevice cuda_dev) +static ucs_status_t uct_gdaki_md_check_uar(uct_ib_mlx5_md_t *md) { struct mlx5dv_devx_uar *uar; ucs_status_t status; - CUcontext cuda_ctx; + CUdevice cuda_dev; unsigned flags; status = uct_ib_mlx5_devx_alloc_uar(md, 0, &uar); @@ -1147,17 +1167,12 @@ uct_gdaki_md_check_uar(uct_ib_mlx5_md_t *md, CUdevice cuda_dev) goto out; } - status = UCT_CUDADRV_FUNC_LOG_ERR( - cuDevicePrimaryCtxRetain(&cuda_ctx, cuda_dev)); - if (status != UCS_OK) { + cuda_dev = uct_gdaki_push_primary_ctx( + md->super.config.gda_retain_inactive_ctx); + if (cuda_dev == CU_DEVICE_INVALID) { goto out_free_uar; } - status = UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(cuda_ctx)); - if (status != UCS_OK) { - goto out_ctx_release; - } - flags = CU_MEMHOSTREGISTER_PORTABLE | CU_MEMHOSTREGISTER_DEVICEMAP | CU_MEMHOSTREGISTER_IOMEMORY; status = UCT_CUDADRV_FUNC_LOG_DEBUG( @@ -1166,9 +1181,7 @@ uct_gdaki_md_check_uar(uct_ib_mlx5_md_t *md, CUdevice cuda_dev) UCT_CUDADRV_FUNC_LOG_DEBUG(cuMemHostUnregister(uar->reg_addr)); } - UCT_CUDADRV_FUNC_LOG_WARN(cuCtxPopCurrent(NULL)); -out_ctx_release: - UCT_CUDADRV_FUNC_LOG_WARN(cuDevicePrimaryCtxRelease(cuda_dev)); + uct_cuda_ctx_primary_pop_and_release(cuda_dev); out_free_uar: mlx5dv_devx_free_uar(uar); out: @@ -1196,7 +1209,7 @@ static int uct_gdaki_is_peermem_loaded(const uct_ib_md_t *md) return peermem_loaded; } -static int uct_gdaki_is_uar_supported(uct_ib_mlx5_md_t *md, CUdevice cu_device) +static int uct_gdaki_is_uar_supported(uct_ib_mlx5_md_t *md) { /** * Save the result of UAR support in a global flag to avoid the overhead of @@ -1209,7 +1222,7 @@ static int uct_gdaki_is_uar_supported(uct_ib_mlx5_md_t *md, CUdevice cu_device) return uar_supported; } - uar_supported = (uct_gdaki_md_check_uar(md, cu_device) == UCS_OK); + uar_supported = (uct_gdaki_md_check_uar(md) == UCS_OK); if (uar_supported == 0) { ucs_diag("GDAKI not supported, please add NVreg_RegistryDwords=" "\"PeerMappingOverride=1;\" option for nvidia kernel driver"); @@ -1437,6 +1450,11 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md, goto out; } + if (!uct_gdaki_is_uar_supported(ib_mlx5_md)) { + status = UCS_ERR_NO_DEVICE; + goto err; + } + num_tl_devices = 0; ucs_for_each_bit(i, ibdesc->cuda_map) { status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&device, i)); @@ -1444,11 +1462,6 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md, goto err; } - if (!uct_gdaki_is_uar_supported(ib_mlx5_md, device)) { - status = UCS_ERR_NO_DEVICE; - goto err; - } - dev = uct_cuda_get_sys_dev(device); snprintf(tl_devices[num_tl_devices].name, From 9aea8e3599cc377b94ae20de822f2132b0469204 Mon Sep 17 00:00:00 2001 From: Raul Akhmetshin Date: Thu, 16 Apr 2026 16:15:05 +0300 Subject: [PATCH 2/2] UCT/IB/MLX5/GDAKI: Grouped checks required ctx to a single method. --- src/uct/ib/mlx5/gdaki/gdaki.c | 163 ++++++++++++++++++---------------- 1 file changed, 86 insertions(+), 77 deletions(-) diff --git a/src/uct/ib/mlx5/gdaki/gdaki.c b/src/uct/ib/mlx5/gdaki/gdaki.c index e0d05d5afa0..fecd0a41fab 100644 --- a/src/uct/ib/mlx5/gdaki/gdaki.c +++ b/src/uct/ib/mlx5/gdaki/gdaki.c @@ -125,53 +125,19 @@ static void uct_rc_gdaki_calc_dev_ep_layout(size_t num_channels, size_t wq_len, *pgsz_bitmap_p = (max_page_size << 1) - 1; } -static CUdevice uct_gdaki_push_primary_ctx(int retain_inactive_ctx) -{ - CUdevice cuda_dev; - ucs_status_t status; - - status = uct_cuda_ctx_primary_push_first_active(&cuda_dev); - if (status == UCS_OK) { - return cuda_dev; - } - - if ((status != UCS_ERR_NO_DEVICE) || !retain_inactive_ctx) { - return CU_DEVICE_INVALID; - } - - status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&cuda_dev, 0)); - if (status != UCS_OK) { - return CU_DEVICE_INVALID; - } - - status = uct_cuda_ctx_primary_push(cuda_dev, 1, UCS_LOG_LEVEL_ERROR); - if (status != UCS_OK) { - return CU_DEVICE_INVALID; - } - - return cuda_dev; -} - static int uct_gdaki_check_umem_dmabuf(const uct_ib_md_t *md) { - ucs_status_t status = UCS_ERR_UNSUPPORTED; + ucs_status_t ret = 0; #if HAVE_DECL_MLX5DV_UMEM_MASK_DMABUF struct mlx5dv_devx_umem_in umem_in = {}; struct mlx5dv_devx_umem *umem; uct_cuda_copy_md_dmabuf_t dmabuf; CUdeviceptr buff; - CUdevice cuda_dev; - cuda_dev = uct_gdaki_push_primary_ctx(md->config.gda_retain_inactive_ctx); - if (cuda_dev == CU_DEVICE_INVALID) { + if (UCT_CUDADRV_FUNC_LOG_ERR(cuMemAlloc(&buff, 1)) != UCS_OK) { return 0; } - status = UCT_CUDADRV_FUNC_LOG_ERR(cuMemAlloc(&buff, 1)); - if (status != UCS_OK) { - goto out_ctx_pop_and_release; - } - dmabuf = uct_cuda_copy_md_get_dmabuf((void*)buff, 1, UCS_SYS_DEVICE_ID_UNKNOWN); @@ -183,20 +149,18 @@ static int uct_gdaki_check_umem_dmabuf(const uct_ib_md_t *md) umem_in.dmabuf_fd = dmabuf.fd; umem = mlx5dv_devx_umem_reg_ex(md->dev.ibv_context, &umem_in); - if (umem == NULL) { - status = UCS_ERR_NO_MEMORY; - goto out_free; + if (umem != NULL) { + mlx5dv_devx_umem_dereg(umem); + ret = 1; + } else { + ret = 0; } - mlx5dv_devx_umem_dereg(umem); -out_free: ucs_close_fd(&dmabuf.fd); - cuMemFree(buff); -out_ctx_pop_and_release: - uct_cuda_ctx_primary_pop_and_release(cuda_dev); + (void)UCT_CUDADRV_FUNC_LOG_WARN(cuMemFree(buff)); #endif - return status == UCS_OK; + return ret; } static int uct_gdaki_is_dmabuf_supported(const uct_ib_md_t *md) @@ -1159,18 +1123,11 @@ static ucs_status_t uct_gdaki_md_check_uar(uct_ib_mlx5_md_t *md) { struct mlx5dv_devx_uar *uar; ucs_status_t status; - CUdevice cuda_dev; unsigned flags; status = uct_ib_mlx5_devx_alloc_uar(md, 0, &uar); if (status != UCS_OK) { - goto out; - } - - cuda_dev = uct_gdaki_push_primary_ctx( - md->super.config.gda_retain_inactive_ctx); - if (cuda_dev == CU_DEVICE_INVALID) { - goto out_free_uar; + return status; } flags = CU_MEMHOSTREGISTER_PORTABLE | CU_MEMHOSTREGISTER_DEVICEMAP | @@ -1178,13 +1135,10 @@ static ucs_status_t uct_gdaki_md_check_uar(uct_ib_mlx5_md_t *md) status = UCT_CUDADRV_FUNC_LOG_DEBUG( cuMemHostRegister(uar->reg_addr, UCT_IB_MLX5_BF_REG_SIZE, flags)); if (status == UCS_OK) { - UCT_CUDADRV_FUNC_LOG_DEBUG(cuMemHostUnregister(uar->reg_addr)); + UCT_CUDADRV_FUNC_LOG_WARN(cuMemHostUnregister(uar->reg_addr)); } - uct_cuda_ctx_primary_pop_and_release(cuda_dev); -out_free_uar: mlx5dv_devx_free_uar(uar); -out: return status; } @@ -1382,6 +1336,80 @@ uct_gdaki_dev_matrix_init(const uct_ib_md_t *ib_md, size_t *dmat_length_p) return dmat; } +static CUdevice uct_gdaki_push_primary_ctx(int retain_inactive_ctx) +{ + CUdevice cuda_dev; + ucs_status_t status; + + status = uct_cuda_ctx_primary_push_first_active(&cuda_dev); + if (status == UCS_OK) { + return cuda_dev; + } + + if ((status != UCS_ERR_NO_DEVICE) || !retain_inactive_ctx) { + if (status == UCS_ERR_NO_DEVICE) { + ucs_diag("no active primary CUDA context on any device. Please set " + "UCX_IB_GDA_RETAIN_INACTIVE_CTX=yes to retain inactive " + "context."); + } + return CU_DEVICE_INVALID; + } + + status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&cuda_dev, 0)); + if (status != UCS_OK) { + return CU_DEVICE_INVALID; + } + + status = uct_cuda_ctx_primary_push(cuda_dev, 1, UCS_LOG_LEVEL_ERROR); + if (status != UCS_OK) { + return CU_DEVICE_INVALID; + } + + return cuda_dev; +} + +static int +uct_gdaki_check_cuda_ctx_dependent_features(uct_ib_mlx5_md_t *ib_mlx5_md) +{ + uct_ib_md_t *ib_md = &ib_mlx5_md->super; + CUdevice cuda_dev; + char dmabuf_str[8]; + int ret; + + cuda_dev = uct_gdaki_push_primary_ctx(ib_md->config.gda_retain_inactive_ctx); + if (cuda_dev == CU_DEVICE_INVALID) { + return 0; + } + + if ((ib_md->config.gda_dmabuf_enable != UCS_NO) && + uct_gdaki_is_dmabuf_supported(ib_md)) { + ib_mlx5_md->flags |= UCT_IB_MLX5_MD_FLAG_REG_DMABUF_UMEM; + ucs_debug("%s: using dmabuf for gda transport", + uct_ib_device_name(&ib_md->dev)); + } else if ((ib_md->config.gda_dmabuf_enable != UCS_YES) && + uct_gdaki_is_peermem_loaded(ib_md)) { + ucs_debug("%s: using peermem for gda transport", + uct_ib_device_name(&ib_md->dev)); + } else { + ucs_config_sprintf_ternary_auto(dmabuf_str, sizeof(dmabuf_str), + &ib_md->config.gda_dmabuf_enable, NULL); + ucs_diag("%s: GPU-direct RDMA is not available (GDA_DMABUF_ENABLE=%s)", + uct_ib_device_name(&ib_md->dev), dmabuf_str); + ret = 0; + goto out; + } + + if (uct_gdaki_is_uar_supported(ib_mlx5_md)) { + ret = 1; + } else { + ret = 0; + } + +out: + uct_cuda_ctx_primary_pop_and_release(cuda_dev); + return ret; +} + static ucs_status_t uct_gdaki_query_tl_devices(uct_md_h tl_md, uct_tl_device_resource_t **tl_devices_p, @@ -1399,7 +1427,6 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md, ucs_sys_device_t dev; int i; uct_gdaki_dev_matrix_elem_t *ibdesc; - char dmabuf_str[8]; UCS_INIT_ONCE(&dmat_once) { dmat = uct_gdaki_dev_matrix_init(ib_md, &dmat_length); @@ -1410,20 +1437,7 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md, goto out; } - if ((ib_md->config.gda_dmabuf_enable != UCS_NO) && - uct_gdaki_is_dmabuf_supported(ib_md)) { - ib_mlx5_md->flags |= UCT_IB_MLX5_MD_FLAG_REG_DMABUF_UMEM; - ucs_debug("%s: using dmabuf for gda transport", - uct_ib_device_name(&ib_md->dev)); - } else if ((ib_md->config.gda_dmabuf_enable != UCS_YES) && - uct_gdaki_is_peermem_loaded(ib_md)) { - ucs_debug("%s: using peermem for gda transport", - uct_ib_device_name(&ib_md->dev)); - } else { - ucs_config_sprintf_ternary_auto(dmabuf_str, sizeof(dmabuf_str), - &ib_md->config.gda_dmabuf_enable, NULL); - ucs_diag("%s: GPU-direct RDMA is not available (GDA_DMABUF_ENABLE=%s)", - uct_ib_device_name(&ib_md->dev), dmabuf_str); + if (!uct_gdaki_check_cuda_ctx_dependent_features(ib_mlx5_md)) { status = UCS_ERR_NO_DEVICE; goto out; } @@ -1450,11 +1464,6 @@ uct_gdaki_query_tl_devices(uct_md_h tl_md, goto out; } - if (!uct_gdaki_is_uar_supported(ib_mlx5_md)) { - status = UCS_ERR_NO_DEVICE; - goto err; - } - num_tl_devices = 0; ucs_for_each_bit(i, ibdesc->cuda_map) { status = UCT_CUDADRV_FUNC_LOG_ERR(cuDeviceGet(&device, i));