-
Notifications
You must be signed in to change notification settings - Fork 296
CI: allow specifying custom driver versions in test matrix #2176
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
b1b6070
3e016b5
c0ca869
4a23b23
d33a928
00896dc
0d5f0e9
3dfaa84
701cf2f
a3f1573
c5fef92
f17dd7f
6412f4f
2b34f1f
8d8a9ef
d2c25eb
fa7940a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -85,8 +85,13 @@ jobs: | |
| # Read base matrix from YAML file for the specific architecture | ||
| TEST_MATRIX=$(yq -o json ".linux[\"${MATRIX_TYPE}\"] | map(select(.ARCH == \"${ARCH}\"))" ci/test-matrix.yml) | ||
|
|
||
| # Apply matrix filter and wrap in include structure | ||
| MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') | ||
| # Apply the matrix filter, then: | ||
| #   1. Reject any row that combines a custom DRIVER with FLAVOR=wsl | ||
| #      (the in-container driver swap doesn't work under WSL). | ||
| #   2. Add a RUNNER_DRIVER field that maps any custom version back to | ||
| #      'latest': the install script swaps the driver itself, so the job | ||
| #      needs to land on the runner that ships with the most recent | ||
| #      pre-installed driver. | ||
| #   3. Wrap the result in an include structure. | ||
| MATRIX=$(echo "$TEST_MATRIX" | jq -c '${{ inputs.matrix_filter }} | if any(.[]; .DRIVER != "latest" and .DRIVER != "earliest" and .FLAVOR == "wsl") then "Error: custom DRIVER is not supported with FLAVOR=wsl\n" | halt_error(1) else . end | map(. + {RUNNER_DRIVER: (if .DRIVER == "latest" or .DRIVER == "earliest" then .DRIVER else "latest" end)}) | if (. | length) > 0 then {include: .} else "Error: Empty matrix\n" | halt_error(1) end') | ||
|
|
||
| echo "MATRIX=${MATRIX}" | tee --append "${GITHUB_OUTPUT}" | ||
|
|
||
|
|
@@ -101,21 +106,21 @@ jobs: | |
| strategy: | ||
| fail-fast: false | ||
| matrix: ${{ fromJSON(needs.compute-matrix.outputs.MATRIX) }} | ||
| runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.DRIVER }}-${{ matrix.GPU_COUNT }}" | ||
| runs-on: "${{ matrix.FLAVOR || 'linux' }}-${{ matrix.ARCH }}-gpu-${{ matrix.GPU }}-${{ matrix.RUNNER_DRIVER }}-${{ matrix.GPU_COUNT }}" | ||
| # The build stage could fail but we want the CI to keep moving. | ||
| if: ${{ github.repository_owner == 'nvidia' && !cancelled() }} | ||
| # Our self-hosted runners require a container | ||
| # TODO: use a different (nvidia?) container | ||
| container: | ||
| options: -u root --security-opt seccomp=unconfined --shm-size 16g | ||
| # Custom-DRIVER rows need --privileged --pid=host so install_gpu_driver.sh | ||
| # can nsenter to the host for the install + refresh the toolkit bind mounts | ||
|
leofang marked this conversation as resolved.
|
||
| # back inside the container. Stock options for latest/earliest rows. | ||
| options: ${{ ((matrix.DRIVER == 'latest' || matrix.DRIVER == 'earliest') && '-u root --security-opt seccomp=unconfined --shm-size 16g') || '-u root --security-opt seccomp=unconfined --shm-size 16g --privileged --pid=host' }} | ||
| image: ubuntu:22.04 | ||
| env: | ||
| NVIDIA_VISIBLE_DEVICES: ${{ env.NVIDIA_VISIBLE_DEVICES }} | ||
| PIP_CACHE_DIR: "/tmp/pip-cache" | ||
| steps: | ||
| - name: Ensure GPU is working | ||
| run: nvidia-smi | ||
|
|
||
| - name: Checkout ${{ github.event.repository.name }} | ||
| uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 | ||
|
|
||
|
|
@@ -127,10 +132,22 @@ jobs: | |
| uses: ./.github/actions/install_unix_deps | ||
| continue-on-error: false | ||
| with: | ||
| # for artifact fetching, graphics libs, g++ required for cffi in example | ||
| dependencies: "jq wget libgl1 libegl1 g++" | ||
| # for artifact fetching, graphics libs, g++ required for cffi in | ||
| # example; util-linux for `nsenter` (custom-DRIVER rows re-exec | ||
| # install_gpu_driver.sh onto the host through nsenter) | ||
| dependencies: "jq wget libgl1 libegl1 g++ util-linux" | ||
| dependent_exes: "jq wget" | ||
|
|
||
| - name: Install GPU driver | ||
| if: ${{ matrix.DRIVER != 'latest' && matrix.DRIVER != 'earliest' }} | ||
| env: | ||
| DRIVER: ${{ matrix.DRIVER }} | ||
| GPU_TYPE: ${{ matrix.GPU }} | ||
|
Comment on lines
+143
to
+145
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need to handle the proprietary vs. open kernel modules?
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. For the M.V.P., use the proprietary kernel modules (hard-coded in |
||
| run: ./ci/tools/install_gpu_driver.sh | ||
|
|
||
| - name: Ensure GPU is working | ||
| run: nvidia-smi | ||
|
|
||
| - name: Set environment variables | ||
| env: | ||
| BUILD_CUDA_VER: ${{ inputs.build-ctk-ver }} | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -13,7 +13,16 @@ | |
| # Windows entries also include DRIVER_MODE. | ||
| # | ||
| # Notes: | ||
| # - DRIVER accepts: | ||
| # * 'latest' - use the runner's pre-installed latest driver (no install step) | ||
| # * 'earliest' - use the runner's pre-installed earliest driver (no install step) | ||
| # * a version string (e.g. '580.65.06') | ||
| # - install that version via ci/tools/install_gpu_driver.sh (Linux) | ||
| # or ci/tools/install_gpu_driver.ps1 (Windows) at the start of the | ||
| # job. The matrix row is routed to the 'latest' runner image (the | ||
| # install scripts swap the driver themselves). | ||
| # - DRIVER: 'earliest' does not work with CUDA 12.9.1 | ||
| # - DRIVER: a custom version is not supported with FLAVOR=wsl on Linux. | ||
|
|
||
| linux: | ||
| pull-request: | ||
|
|
@@ -29,10 +38,10 @@ linux: | |
| - { ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'v100', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.13', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'rtxpro6000', GPU_COUNT: '1', DRIVER: '610.43.02' } | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.14', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: '610.43.02' } | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '12.9.1', LOCAL_CTK: '1', GPU: 't4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.0.2', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { ARCH: 'amd64', PY_VER: '3.14t', CUDA_VER: '13.3.0', LOCAL_CTK: '1', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
|
|
@@ -74,7 +83,7 @@ linux: | |
| - { MODE: 'nightly-pytorch', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' } | ||
| # nightly-numba-cuda | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '580.65.06' } | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'arm64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest' } | ||
| # nightly-standard (arm64 l4×2 — nightly-only per runner team request) | ||
|
|
@@ -113,4 +122,4 @@ windows: | |
| - { MODE: 'nightly-pytorch', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.0.2', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC', TORCH_VER: '2.9.1', TORCH_CUDA: 'cu130' } | ||
| # nightly-numba-cuda | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '12.9.1', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: 'latest', DRIVER_MODE: 'TCC' } | ||
| - { MODE: 'nightly-numba-cuda', ARCH: 'amd64', PY_VER: '3.12', CUDA_VER: '13.3.0', LOCAL_CTK: '0', GPU: 'l4', GPU_COUNT: '1', DRIVER: '596.36', DRIVER_MODE: 'TCC' } | ||
|
Member
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. |
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,58 @@ | ||
| # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| # | ||
| # SPDX-License-Identifier: Apache-2.0 | ||
| # | ||
| # configure_driver_mode.ps1 -- set the NVIDIA driver mode on a Windows CI | ||
| # runner and cycle the display devices so the new mode takes effect | ||
| # without rebooting. Always runs (whether or not install_gpu_driver.ps1 | ||
| # just ran). When install_gpu_driver.ps1 has run, this single device | ||
| # cycle also activates the freshly-installed driver. | ||
| # | ||
| # Inputs (env): | ||
| # DRIVER_MODE One of WDDM, TCC, MCDM. | ||
|
|
||
| function Set-DriverMode { | ||
|
|
||
| # Map matrix DRIVER_MODE to nvidia-smi -fdm code. | ||
| # This assumes we have the prior knowledge on which GPU can use which mode. | ||
| $driver_mode = $env:DRIVER_MODE | ||
| if ($driver_mode -eq "WDDM") { | ||
| Write-Output "Setting driver mode to WDDM..." | ||
| nvidia-smi -fdm 0 | ||
| } elseif ($driver_mode -eq "TCC") { | ||
| Write-Output "Setting driver mode to TCC..." | ||
| nvidia-smi -fdm 1 | ||
| } elseif ($driver_mode -eq "MCDM") { | ||
| Write-Output "Setting driver mode to MCDM..." | ||
| nvidia-smi -fdm 2 | ||
| } else { | ||
| Write-Output "Unknown driver mode: $driver_mode" | ||
| exit 1 | ||
| } | ||
|
|
||
| # Only restart NVIDIA display adapters, not other display devices (e.g. QEMU VGA) | ||
| $nvidia_devices = Get-PnpDevice -Class Display -FriendlyName "NVIDIA*" | ||
| foreach ($device in $nvidia_devices) { | ||
| Write-Output "Restarting device: $($device.FriendlyName) ($($device.InstanceId))" | ||
| pnputil /disable-device "$($device.InstanceId)" | ||
| pnputil /enable-device "$($device.InstanceId)" | ||
| } | ||
|
leofang marked this conversation as resolved.
|
||
|
|
||
| # Poll nvidia-smi until NVML can initialize, or give up after ~60s. | ||
| # A fixed sleep is not enough on multi-GPU rows that are slow to come | ||
| # back up (e.g. 2x H100 MCDM), where `pnputil /enable-device` returns | ||
| # before NVML is ready. Pattern borrowed from the runner-team | ||
| # `nvgha-driver.ps1`. | ||
| Write-Output "Waiting for nvidia-smi/NVML to come back up after device cycle..." | ||
| $deadline = (Get-Date).AddSeconds(60) | ||
| do { | ||
| Start-Sleep -Seconds 2 | ||
| & nvidia-smi.exe 2>&1 | Out-Null | ||
| } while ($LASTEXITCODE -ne 0 -and (Get-Date) -lt $deadline) | ||
| if ($LASTEXITCODE -ne 0) { | ||
| Write-Error "nvidia-smi did not return cleanly within 60s of the device cycle" | ||
| exit 1 | ||
| } | ||
| } | ||
|
|
||
| # Run the functions | ||
| Set-DriverMode | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Does this mean we can't test various driver versions under WSL?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Correct. By the time a WSL runner starts, we are already inside the WSL terminal, but the rule is that any WSL driver update must be done on the native Windows side, not from inside WSL.