Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .ci/scripts/env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,4 @@ export CONFIGS_DIR="${SRC_ROOT_DIR}/.ci/configs"

# DLRM MASTER_PORT
export MASTER_PORT="12346"
export DOCKER_SSH_PORT="12345"
export DOCKER_SSH_PORT="${DOCKER_SSH_PORT:-12345}"
6 changes: 3 additions & 3 deletions .ci/scripts/run_dlrm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ mpirun \
-np $NP \
--hostfile ${HOSTFILE} \
--map-by node \
--mca plm_rsh_args '-p 12345' \
--mca plm_rsh_args "-p ${DOCKER_SSH_PORT}" \
-x PATH \
-x LD_LIBRARY_PATH \
hostname
Expand All @@ -42,7 +42,7 @@ mpirun \
-np $NP \
--hostfile ${HOSTFILE} \
--map-by node \
--mca plm_rsh_args '-p 12345' \
--mca plm_rsh_args "-p ${DOCKER_SSH_PORT}" \
-x PATH \
-x LD_LIBRARY_PATH \
cat /proc/1/cgroup
Expand All @@ -52,7 +52,7 @@ mpirun \
-np $NP \
--hostfile ${HOSTFILE} \
--map-by node \
--mca plm_rsh_args '-p 12345' \
--mca plm_rsh_args "-p ${DOCKER_SSH_PORT}" \
-x PATH \
-x LD_LIBRARY_PATH \
-x MASTER_ADDR \
Expand Down
2 changes: 1 addition & 1 deletion .ci/scripts/run_tests_ucc_mpi.sh
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@ function mpi_params {
fi
echo "-np $((nnodes*ppn)) --oversubscribe --hostfile ${HOSTFILE} \
--map-by ppr:$ppn:node --bind-to socket \
-x PATH -x LD_LIBRARY_PATH --mca opal_common_ucx_opal_mem_hooks 1 --mca plm_rsh_args -p12345 \
-x PATH -x LD_LIBRARY_PATH --mca opal_common_ucx_opal_mem_hooks 1 --mca plm_rsh_args -p${DOCKER_SSH_PORT} \
--mca coll ^ucc,hcoll \
-x UCX_NET_DEVICES=$DEV:1"
}
Expand Down
18 changes: 18 additions & 0 deletions .ci/scripts/run_tests_ucc_nvls.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#!/bin/bash
#
# Runs a single ucc_perftest allreduce measurement with the Open MPI / UCC
# environment configured below. Takes no arguments.
#
# Shell options are applied with `set` rather than on the shebang line so that
# they are also in effect when the script is started as `bash <script>`.
#   -e  exit on the first failing command
#   -E  let an ERR trap (if one is installed) be inherited by functions
#   -x  trace every command
set -eEx

# Resolve the physical directory that contains this script.
# `cd` and `pwd` are chained with `&&` because bash clears errexit inside
# command substitutions: without the chain, a failed `cd` would still let
# `pwd -P` print the caller's working directory and SCRIPT_DIR would be
# silently wrong. With the chain, the substitution fails and `set -e` aborts.
SCRIPT_DIR="$(cd -- "$(dirname -- "$0")" && pwd -P)"
cd -- "${SCRIPT_DIR}"

# Shared CI settings that live next to this script.
. "${SCRIPT_DIR}/env.sh"

# Open MPI MCA parameters, passed through the environment.
export OMPI_MCA_coll=^hcoll
export OMPI_MCA_coll_ucc_enable=0

# UCC settings for this run.
export UCC_TLS=cuda,ucp
export UCC_LOG_LEVEL=info
export UCC_TL_CUDA_NVLS_SM_COUNT=20
export UCC_TL_CUDA_TUNE=allreduce:cuda:@0

# NOTE(review): flag meanings are as understood from ucc_perftest usage
# (allreduce on CUDA memory, bfloat16 with sum, sizes 1k..32M) -- confirm
# against `ucc_perftest -h` before relying on this description.
/opt/nvidia/bin/ucc/build/bin/ucc_perftest \
  -c allreduce \
  -F \
  -m cuda \
  -b 1k \
  -e 32M \
  -d bfloat16 \
  -o sum
12 changes: 12 additions & 0 deletions .ci/scripts/run_tests_ucc_nvls_slurm.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
#!/bin/bash
#
# Launches run_tests_ucc_nvls.sh on two nodes through
# `scctl client connect -- srun ...`.
#
# Required environment (checked after env.sh has been sourced):
#   JOB_ID             value handed to srun as --jobid
#   DOCKER_IMAGE_NAME  value handed to srun as --container-image
#
# Shell options are applied with `set` rather than on the shebang line so that
# they are also in effect when the script is started as `bash <script>`.
set -eEx

# Resolve the physical directory that contains this script.
# `cd` and `pwd` are chained with `&&` because bash clears errexit inside
# command substitutions: without the chain, a failed `cd` would still let
# `pwd -P` print the caller's working directory and SCRIPT_DIR would be
# silently wrong.
SCRIPT_DIR="$(cd -- "$(dirname -- "$0")" && pwd -P)"
cd -- "${SCRIPT_DIR}"

# Shared CI settings that live next to this script.
. "${SCRIPT_DIR}/env.sh"

# Stop with a clear message instead of handing srun an empty --jobid= or
# --container-image= argument.
: "${JOB_ID:?JOB_ID must be set to the Slurm job to run in}"
: "${DOCKER_IMAGE_NAME:?DOCKER_IMAGE_NAME must be set to the container image}"

# slurm_command_prefix="scctl client connect --"
scctl client connect -- srun \
  --jobid="${JOB_ID}" \
  --nodes=2 \
  --mpi=pmix \
  --cpu-bind=verbose \
  --ntasks-per-node=1 \
  --gpus-per-node=1 \
  --container-image="${DOCKER_IMAGE_NAME}" \
  /opt/nvidia/src/ucc/.ci/scripts/run_tests_ucc_nvls.sh
Loading