diff --git a/src/uct/ib/mlx5/gdaki/gdaki.c b/src/uct/ib/mlx5/gdaki/gdaki.c index bfe5003416c..e5bad6b8eb0 100644 --- a/src/uct/ib/mlx5/gdaki/gdaki.c +++ b/src/uct/ib/mlx5/gdaki/gdaki.c @@ -205,45 +205,39 @@ static int uct_gdaki_is_dmabuf_supported(const uct_ib_md_t *md) return dmabuf_supported; } -static uct_cuda_copy_md_dmabuf_t uct_rc_gdaki_get_dmabuf(const uct_ib_md_t *md, - const void *address, - size_t length) -{ - uct_ib_mlx5_md_t *ib_mlx5_md = ucs_derived_of(md, uct_ib_mlx5_md_t); - uct_cuda_copy_md_dmabuf_t dmabuf = { - .fd = UCT_DMABUF_FD_INVALID, - .offset = 0 - }; - - if (ib_mlx5_md->flags & UCT_IB_MLX5_MD_FLAG_REG_DMABUF_UMEM) { - return uct_cuda_copy_md_get_dmabuf(address, length, - UCS_SYS_DEVICE_ID_UNKNOWN); - } - - return dmabuf; -} - static struct mlx5dv_devx_umem * uct_rc_gdaki_umem_reg(const uct_ib_md_t *md, struct ibv_context *ibv_context, void *address, size_t length, uint64_t pgsz_bitmap) { #if HAVE_DECL_MLX5DV_UMEM_MASK_DMABUF + uct_ib_mlx5_md_t *ib_mlx5_md = ucs_derived_of(md, uct_ib_mlx5_md_t); struct mlx5dv_devx_umem_in umem_in = {}; - uct_cuda_copy_md_dmabuf_t dmabuf; + uct_cuda_copy_md_dmabuf_t dmabuf = { + .fd = UCT_DMABUF_FD_INVALID, + .offset = 0 + }; struct mlx5dv_devx_umem *umem; umem_in.addr = address; umem_in.size = length; umem_in.access = IBV_ACCESS_LOCAL_WRITE; umem_in.pgsz_bitmap = pgsz_bitmap; - dmabuf = uct_rc_gdaki_get_dmabuf(md, address, length); - if (dmabuf.fd != UCT_DMABUF_FD_INVALID) { - umem_in.comp_mask = MLX5DV_UMEM_MASK_DMABUF; - umem_in.dmabuf_fd = dmabuf.fd; + + if (ib_mlx5_md->flags & UCT_IB_MLX5_MD_FLAG_REG_DMABUF_UMEM) { + dmabuf = uct_cuda_copy_md_get_dmabuf(address, length, + UCS_SYS_DEVICE_ID_UNKNOWN); + if (dmabuf.fd != UCT_DMABUF_FD_INVALID) { + umem_in.comp_mask = MLX5DV_UMEM_MASK_DMABUF; + umem_in.dmabuf_fd = dmabuf.fd; + } } umem = mlx5dv_devx_umem_reg_ex(ibv_context, &umem_in); - ucs_close_fd(&dmabuf.fd); + + if (dmabuf.fd != UCT_DMABUF_FD_INVALID) { + ucs_close_fd(&dmabuf.fd); + } + return umem; 
#else return mlx5dv_devx_umem_reg(ibv_context, address, length, @@ -852,6 +846,9 @@ uct_rc_gdaki_ep_get_device_ep(uct_ep_h tl_ep, uct_device_ep_h *device_ep_p) iface->num_channels * UCT_GDAKI_DEV_QP_SIZE; dev_ep = ucs_calloc(1, dev_ep_host_size, "dev_ep"); if (dev_ep == NULL) { + ucs_error("failed to allocate dev_ep on host buffer size=%zu ep=%p " "iface=%p", dev_ep_host_size, ep, iface); status = UCS_ERR_NO_MEMORY; goto out_ctx; } @@ -900,7 +897,7 @@ uct_rc_gdaki_ep_get_device_ep(uct_ep_h tl_ep, uct_device_ep_h *device_ep_p) cuMemcpyHtoD((CUdeviceptr)ep->channel_block->gpu_ptr, dev_ep, dev_ep_host_size)); if (status != UCS_OK) { - goto out_free; + goto out_unreg; } ucs_free(dev_ep); @@ -979,27 +976,6 @@ static uct_iface_ops_t uct_rc_gdaki_iface_tl_ops = { ucs_empty_function_return_unsupported, }; -static ucs_status_t uct_rc_gdaki_reg_mr(const uct_ib_md_t *md, void *address, - size_t length, struct ibv_mr **mr_p) -{ - uct_md_mem_reg_params_t params; - uct_cuda_copy_md_dmabuf_t dmabuf; - ucs_status_t status; - - params.field_mask = UCT_MD_MEM_REG_FIELD_FLAGS | - UCT_MD_MEM_REG_FIELD_DMABUF_FD | - UCT_MD_MEM_REG_FIELD_DMABUF_OFFSET; - params.flags = 0; - dmabuf = uct_rc_gdaki_get_dmabuf(md, address, length); - params.dmabuf_fd = dmabuf.fd; - params.dmabuf_offset = dmabuf.offset; - - status = uct_ib_reg_mr(md, address, length, &params, - UCT_IB_MEM_ACCESS_FLAGS, NULL, mr_p); - ucs_close_fd(&dmabuf.fd); - return status; -} - static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_iface_t, uct_md_h tl_md, uct_worker_h worker, const uct_iface_params_t *params, @@ -1009,11 +985,13 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_iface_t, uct_md_h tl_md, ucs_derived_of(tl_config, uct_rc_gdaki_iface_config_t); uct_ib_mlx5_md_t *md = ucs_derived_of(tl_md, uct_ib_mlx5_md_t); uct_ib_iface_init_attr_t init_attr = {}; + uct_md_mem_reg_params_t reg_params = {}; UCS_STRING_BUFFER_ONSTACK(strb, 64); char *gpu_name, *ib_name; char pci_addr[UCS_SYS_BDF_NAME_MAX]; ucs_status_t status; int cuda_id; + int 
ret; if (config->num_channels > UINT8_MAX + 1) { ucs_error("num_channels exceeds maximum value of 256"); @@ -1073,16 +1051,20 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_iface_t, uct_md_h tl_md, goto err_ctx_release; } - status = uct_rc_gdaki_alloc(sizeof(uint64_t), sizeof(uint64_t), - (void**)&self->atomic_buff, &self->atomic_raw); - if (status != UCS_OK) { + ret = ucs_posix_memalign((void**)&self->atomic_buff, + UCS_SYS_CACHE_LINE_SIZE, sizeof(uint64_t), + "gdaki_atomic_buf"); + if (ret != 0) { + ucs_error("failed to allocate AMO buffer"); + status = UCS_ERR_NO_MEMORY; goto err_ctx; } - status = uct_rc_gdaki_reg_mr(&md->super, self->atomic_buff, - sizeof(uint64_t), &self->atomic_mr); + status = uct_ib_reg_mr(&md->super, self->atomic_buff, sizeof(uint64_t), + &reg_params, UCT_IB_MEM_ACCESS_FLAGS, NULL, + &self->atomic_mr); if (status != UCS_OK) { - goto err_atomic; + goto err_atomic_buff; } if (pthread_mutex_init(&self->ep_init_lock, NULL) != 0) { @@ -1104,8 +1086,8 @@ static UCS_CLASS_INIT_FUNC(uct_rc_gdaki_iface_t, uct_md_h tl_md, pthread_mutex_destroy(&self->ep_init_lock); err_lock: ibv_dereg_mr(self->atomic_mr); -err_atomic: - cuMemFree(self->atomic_raw); +err_atomic_buff: + ucs_free(self->atomic_buff); err_ctx: (void)UCT_CUDADRV_FUNC_LOG_WARN(cuCtxPopCurrent(NULL)); err_ctx_release: @@ -1117,11 +1099,11 @@ static UCS_CLASS_CLEANUP_FUNC(uct_rc_gdaki_iface_t) { pthread_mutex_destroy(&self->ep_init_lock); ibv_dereg_mr(self->atomic_mr); + ucs_free(self->atomic_buff); (void)UCT_CUDADRV_FUNC_LOG_ERR(cuCtxPushCurrent(self->cuda_ctx)); if (self->ep_alloc_mode == UCT_RC_GDAKI_EP_ALLOC_MODE_POOL) { uct_rc_gdaki_iface_cleanup_channel_pool(self); } - cuMemFree(self->atomic_raw); (void)UCT_CUDADRV_FUNC_LOG_WARN(cuCtxPopCurrent(NULL)); (void)UCT_CUDADRV_FUNC_LOG_WARN(cuDevicePrimaryCtxRelease(self->cuda_dev)); } diff --git a/src/uct/ib/mlx5/gdaki/gdaki.h b/src/uct/ib/mlx5/gdaki/gdaki.h index 21818493f3c..1b1034dd166 100644 --- a/src/uct/ib/mlx5/gdaki/gdaki.h +++ 
b/src/uct/ib/mlx5/gdaki/gdaki.h @@ -38,7 +38,6 @@ typedef struct uct_rc_gdaki_iface { uct_rc_mlx5_iface_common_t super; CUdevice cuda_dev; struct ibv_mr *atomic_mr; - CUdeviceptr atomic_raw; uint64_t *atomic_buff; CUcontext cuda_ctx; unsigned num_channels;