Skip to content

Commit 3b42df3

Browse files
committed
fix (experimental): choose port based on job_id to prevent clashes on non-exclusive nodes
1 parent b27fe1e commit 3b42df3

File tree

1 file changed

+1
-0
lines changed

1 file changed

+1
-0
lines changed

dmlcloud/core/distributed.py

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -434,6 +434,7 @@ def _init_process_group_slurm(port=DEFAULT_PORT, **kwargs):
434 | 434 |       _WorkerInfo.LOCAL_WORLD_SIZE = tasks_per_node[_WorkerInfo.NODE_ID]
435 | 435 |
436 | 436 |       ip = os.environ['SLURM_SRUN_COMM_HOST']
    | 437 | +     port += int(os.environ['SLURM_JOB_ID']) % 7879
437 | 438 |
438 | 439 |       dist.init_process_group(
439 | 440 |           init_method=f'tcp://{ip}:{port}',

0 commit comments

Comments
 (0)