From 170c799b742c3d3433d28e56eca6b5ec923c80b3 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Sat, 13 Jun 2026 01:28:40 +0000 Subject: [PATCH 1/3] ci: replace 'cancelled||failure' aggregator with CCCL-pattern check_result Fixes #2208. The previous `Check job status` body only treated a dep's `result == 'cancelled' || == 'failure'` as failure, letting `'skipped'` slip through silently. When a `build-*` job fails, the dependent `test-*` job is set to `'skipped'` by default needs-failure propagation, and the aggregator passes -- exactly the case demonstrated by #2209. Adopt CCCL's `check_result` pattern: explicit `expected="success"` per dependency, with `expected="skipped"` for legitimate `[doc-only]` skips, and an early short-circuit for `[no-ci]`. Now any deviation from the expected status (including `'skipped'` from a failed upstream) fails the aggregator. Reference: NVIDIA/cccl ci-workflow-pull-request.yml L463-L526. --- .github/workflows/ci.yml | 84 ++++++++++++++++++---------------------- 1 file changed, 37 insertions(+), 47 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d15438d5689..338e664bf10 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -432,52 +432,42 @@ jobs: steps: - name: Exit run: | - # if any dependencies were cancelled or failed, that's a failure - # - # see https://docs.github.com/en/actions/reference/workflows-and-actions/expressions#always - # and https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/collaborating-on-repositories-with-code-quality-features/troubleshooting-required-status-checks#handling-skipped-but-required-checks - # for why this cannot be encoded in the job-level `if:` field - # - # TL; DR: `$REASONS` - # - # The intersection of skipped-as-success and required status checks - # creates a scenario where if you DON'T `always()` run this job, the - # status check UI will block merging and if you DO `always()` run and - # a dependency is _cancelled_ (due to a critical failure, which is - # somehow not considered a failure ¯\_(ツ)_/¯) then the critically - # failing job(s) will timeout causing a cancellation here and the - # build to succeed which we don't want (originally this was just - # 'exit 0') - # - # Note: When [doc-only] is in PR title, test jobs are intentionally - # skipped and should not cause failure. - # - # detect-changes gates whether heavy test matrices run at all; if it - # does not succeed, downstream test jobs are skipped rather than - # failed, which would otherwise go unnoticed here. Require its - # success explicitly so a broken gating step cannot masquerade as a - # green CI run. - doc_only=${{ needs.should-skip.outputs.doc-only }} - if ${{ needs.detect-changes.result != 'success' }}; then - exit 1 - fi - if ${{ needs.doc.result == 'cancelled' || needs.doc.result == 'failure' }}; then - exit 1 + # GitHub treats `result == 'skipped'` as success for required + # status checks (see CCCL gate comment + cccl#605). The previous + # `cancelled || failure` predicate let upstream build failures + # propagate as `skipped` on downstream test jobs and silently + # pass this aggregator. Adopt CCCL's `check_result` pattern: + # require an explicit `expected` status per dependency, where + # anything else (including `skipped` from a failed upstream) + # fails the gate. `if: always()` on the job still ensures this + # step runs even when needs are skipped. + if [[ "${{ needs.should-skip.outputs.skip }}" == "true" ]]; then + echo "[no-ci] - skipping aggregator checks" + exit 0 fi - if ${{ needs.test-sdist-linux.result == 'cancelled' || - needs.test-sdist-linux.result == 'failure' || - needs.test-sdist-windows.result == 'cancelled' || - needs.test-sdist-windows.result == 'failure' }}; then - exit 1 - fi - if [[ "${doc_only}" != "true" ]]; then - if ${{ needs.test-linux-64.result == 'cancelled' || - needs.test-linux-64.result == 'failure' || - needs.test-linux-aarch64.result == 'cancelled' || - needs.test-linux-aarch64.result == 'failure' || - needs.test-windows.result == 'cancelled' || - needs.test-windows.result == 'failure' }}; then - exit 1 + + doc_only="${{ needs.should-skip.outputs.doc-only }}" + status="success" + check_result() { + name=$1; expected=$2; result=$3 + echo "Checking $name: result='$result' (expected '$expected')" + if [[ "$result" != "$expected" ]]; then + echo "::error::$name did not match expected result" + status="failed" fi - fi - exit 0 + } + + # always expected to succeed (even in [doc-only] mode) + check_result "should-skip" "success" "${{ needs.should-skip.result }}" + check_result "detect-changes" "success" "${{ needs.detect-changes.result }}" + check_result "doc" "success" "${{ needs.doc.result }}" + + # [doc-only] flips these from 'success' to 'skipped' + if [[ "$doc_only" == "true" ]]; then expected="skipped"; else expected="success"; fi + check_result "test-sdist-linux" "$expected" "${{ needs.test-sdist-linux.result }}" + check_result "test-sdist-windows" "$expected" "${{ needs.test-sdist-windows.result }}" + check_result "test-linux-64" "$expected" "${{ needs.test-linux-64.result }}" + check_result "test-linux-aarch64" "$expected" "${{ needs.test-linux-aarch64.result }}" + check_result "test-windows" "$expected" "${{ needs.test-windows.result }}" + + [[ "$status" == "success" ]] From 811388c99e430192c2116aeaa12862e33676cbe7 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 16 Jun 2026 01:36:41 +0000 Subject: [PATCH 2/3] [demo] Re-inject env-vars exit 7 to validate gating-check fix Mirrors the #2209 reproducer on this branch's new aggregator. Expected outcome: - All Build * matrix entries fail at the env-vars build step. - Downstream Test * jobs are skipped (cascaded needs-failure). - Check job status now reports failure -- the symmetric, post-fix counterpart to #2209 (where Check job status reported success despite the same set of failures). DO NOT MERGE while this commit is present. Remove before merge. Refs #2208, #2209. --- ci/tools/env-vars | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/ci/tools/env-vars b/ci/tools/env-vars index 30fac1cdce8..0255af32112 100755 --- a/ci/tools/env-vars +++ b/ci/tools/env-vars @@ -15,6 +15,16 @@ if [[ ${#} -ne 1 ]]; then exit 1 fi +# [demo] Intentional failure to validate the gating-check fix in this PR. +# Mirrors the #2209 reproducer: every Build matrix entry should fail here, +# downstream Test jobs should be skipped, and -- with the new aggregator -- +# Check job status should now report failure (whereas it reported success +# in #2209 before the fix). REMOVE THIS BLOCK before merging. +if [[ "${1}" == "build" ]]; then + echo "::error::INTENTIONAL FAILURE validating gating-check fix on PR #2210. See https://github.com/NVIDIA/cuda-python/issues/2208" >&2 + exit 7 +fi + PYTHON_VERSION_FORMATTED=$(echo "${PY_VER}" | tr -d '.') if [[ "${HOST_PLATFORM}" == linux* ]]; then From 7d31a0433f1759f59d74e2b0fddc6b49bfa0ead6 Mon Sep 17 00:00:00 2001 From: Leo Fang Date: Tue, 16 Jun 2026 01:57:47 +0000 Subject: [PATCH 3/3] Revert "[demo] Re-inject env-vars exit 7 to validate gating-check fix" This reverts commit 811388c99e430192c2116aeaa12862e33676cbe7. --- ci/tools/env-vars | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/ci/tools/env-vars b/ci/tools/env-vars index 0255af32112..30fac1cdce8 100755 --- a/ci/tools/env-vars +++ b/ci/tools/env-vars @@ -15,16 +15,6 @@ if [[ ${#} -ne 1 ]]; then exit 1 fi -# [demo] Intentional failure to validate the gating-check fix in this PR. -# Mirrors the #2209 reproducer: every Build matrix entry should fail here, -# downstream Test jobs should be skipped, and -- with the new aggregator -- -# Check job status should now report failure (whereas it reported success -# in #2209 before the fix). REMOVE THIS BLOCK before merging. -if [[ "${1}" == "build" ]]; then - echo "::error::INTENTIONAL FAILURE validating gating-check fix on PR #2210. See https://github.com/NVIDIA/cuda-python/issues/2208" >&2 - exit 7 -fi - PYTHON_VERSION_FORMATTED=$(echo "${PY_VER}" | tr -d '.') if [[ "${HOST_PLATFORM}" == linux* ]]; then