Skip to content

Commit 904d298

Browse files
authored
retain global pointers to previous default rmm memory resources (#995)
This is needed to avoid race condition segfaults with SAM when SAM headroom is reduced from its initial larger value during data loading to a smaller value during computations. --------- Signed-off-by: Erik Ordentlich <[email protected]>
1 parent 7e311c6 commit 904d298

6 files changed

Lines changed: 21 additions & 11 deletions

File tree

python/src/spark_rapids_ml/classification.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1071,7 +1071,6 @@ def _single_fit(init_parameters: Dict[str, Any]) -> Dict[str, Any]:
10711071
cuda_managed_mem_enabled,
10721072
cuda_system_mem_enabled,
10731073
cuda_system_mem_headroom,
1074-
force_sam_headroom=True,
10751074
)
10761075

10771076
logistic_regression.fit(

python/src/spark_rapids_ml/clustering.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -391,7 +391,6 @@ def _cuml_fit(
391391
cuda_managed_mem_enabled,
392392
cuda_system_mem_enabled,
393393
cuda_system_mem_headroom,
394-
force_sam_headroom=True,
395394
)
396395

397396
kmeans_object._fit(
@@ -997,7 +996,6 @@ def _cuml_fit(
997996
cuda_managed_mem_enabled,
998997
cuda_system_mem_enabled,
999998
cuda_system_mem_headroom,
1000-
force_sam_headroom=True,
1001999
)
10021000

10031001
# Set out_dtype to 64bit to get larger indexType in cuML for avoiding overflow

python/src/spark_rapids_ml/knn.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -758,7 +758,6 @@ async def do_allGather() -> List[str]:
758758
cuda_managed_mem_enabled,
759759
cuda_system_mem_enabled,
760760
cuda_system_mem_headroom,
761-
force_sam_headroom=True,
762761
)
763762

764763
res_tuple: Tuple[List[np.ndarray], List[np.ndarray]] = nn_object.kneighbors(

python/src/spark_rapids_ml/tree.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,6 @@ def _single_fit(rf: cuRf) -> Dict[str, Any]:
407407
cuda_managed_mem_enabled,
408408
cuda_system_mem_enabled,
409409
cuda_system_mem_headroom,
410-
force_sam_headroom=True,
411410
)
412411

413412
# Fit a random forest model on the dataset (X, y)

python/src/spark_rapids_ml/umap.py

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1044,7 +1044,6 @@ def _cuml_fit(
10441044
cuda_managed_mem_enabled,
10451045
cuda_system_mem_enabled,
10461046
cuda_system_mem_headroom,
1047-
force_sam_headroom=True,
10481047
)
10491048

10501049
umap_model = umap_object.fit(concated, y=labels)
@@ -1054,7 +1053,6 @@ def _cuml_fit(
10541053
cuda_managed_mem_enabled,
10551054
cuda_system_mem_enabled,
10561055
cuda_system_mem_headroom,
1057-
force_sam_headroom=True,
10581056
)
10591057

10601058
# Call unsupervised fit

python/src/spark_rapids_ml/utils.py

Lines changed: 21 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -163,17 +163,29 @@ def _get_gpu_id(task_context: TaskContext) -> int:
163163
return gpu_id
164164

165165

166+
# When changing default rmm memory resources we retain the old ones
167+
# in this global array singleton so that any (C++) allocations using them can
168+
# invoke the corresponding deallocate methods. They will get cleaned up only when
169+
# the process exits. This avoids a segfault in the case of creating a new
170+
# SAM resource with a smaller headroom.
171+
_old_memory_resources = []
172+
173+
# keep track of last headroom to check if new sam mr is needed.
174+
_last_sam_headroom_size = None
175+
176+
166177
def _configure_memory_resource(
167178
uvm_enabled: bool = False,
168179
sam_enabled: bool = False,
169180
sam_headroom: Optional[int] = None,
170-
force_sam_headroom: bool = False,
171181
) -> None:
172182
import cupy as cp
173183
import rmm
174184
from cuda.bindings import runtime
175185
from rmm.allocators.cupy import rmm_cupy_allocator
176186

187+
global _last_sam_headroom_size
188+
177189
_SYSTEM_MEMORY_SUPPORTED = rmm._cuda.gpu.getDeviceAttribute(
178190
runtime.cudaDeviceAttr.cudaDevAttrPageableMemoryAccess,
179191
rmm._cuda.gpu.getDevice(),
@@ -193,19 +205,24 @@ def _configure_memory_resource(
193205
if not type(rmm.mr.get_current_device_resource()) == type(
194206
rmm.mr.SystemMemoryResource()
195207
):
208+
_old_memory_resources.append(rmm.mr.get_current_device_resource())
209+
_last_sam_headroom_size = None
196210
mr = rmm.mr.SystemMemoryResource()
197211
rmm.mr.set_current_device_resource(mr)
198212
elif sam_enabled and sam_headroom is not None:
199-
if force_sam_headroom or not type(rmm.mr.get_current_device_resource()) == type(
200-
rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom)
201-
):
213+
if sam_headroom != _last_sam_headroom_size or not type(
214+
rmm.mr.get_current_device_resource()
215+
) == type(rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom)):
216+
_old_memory_resources.append(rmm.mr.get_current_device_resource())
217+
_last_sam_headroom_size = sam_headroom
202218
mr = rmm.mr.SamHeadroomMemoryResource(headroom=sam_headroom)
203219
rmm.mr.set_current_device_resource(mr)
204220

205221
if uvm_enabled:
206222
if not type(rmm.mr.get_current_device_resource()) == type(
207223
rmm.mr.ManagedMemoryResource()
208224
):
225+
_old_memory_resources.append(rmm.mr.get_current_device_resource())
209226
rmm.mr.set_current_device_resource(rmm.mr.ManagedMemoryResource())
210227

211228
if sam_enabled or uvm_enabled:

0 commit comments

Comments
 (0)