From 38d039439e391e2be54f7ad82e31bd3faa5b8895 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?=
Date: Fri, 12 Jun 2026 20:51:38 +0200
Subject: [PATCH 1/2] [SPARK-57420][INFRA] Only generate TPC-DS data when
 required and check CPU compatibility early in benchmark workflow

Two improvements to the benchmark workflow:

1. Skip TPC-DS data generation for non-TPCDS benchmarks. Change
   contains(inputs.class, '*') to inputs.class == '*' so wildcard
   patterns like '*VectorizedDeltaReaderBenchmark' no longer trigger
   the expensive TPC-DS generation job (~5-10 min saved per run).

2. Add early CPU model check step that runs immediately after
   checkout, before compilation. Prints the CPU as a ::notice::
   annotation for live visibility, and optionally fails fast if the
   runner CPU does not match the expected-cpu input parameter.

Assisted-by: OpenCode:claude-opus-4.6
---
 .github/workflows/benchmark.yml | 25 +++++++++++++++++++++++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 3f20752d4b6a5..088c2d240983b 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -56,6 +56,11 @@ on:
         description: 'Commit the benchmark results to the current branch'
         required: true
         default: false
+      expected-cpu:
+        type: string
+        description: 'Expected CPU model (e.g. "AMD EPYC 7763"). If set, the job fails early when the runner CPU does not match.'
+        required: false
+        default: ''

 jobs:
   matrix-gen:
@@ -73,7 +78,10 @@ jobs:
   # Any TPC-DS related updates on this job need to be applied to tpcds-1g job of build_and_test.yml as well
   tpcds-1g-gen:
     name: "Generate an TPC-DS dataset with SF=1"
-    if: contains(inputs.class, 'TPCDSQueryBenchmark') || contains(inputs.class, 'LZ4TPCDSDataBenchmark') || contains(inputs.class, 'ZStandardTPCDSDataBenchmark') || contains(inputs.class, '*')
+    # Only generate TPC-DS data when running TPC-DS benchmarks or all benchmarks (class == '*').
+    # Use exact equality instead of contains(inputs.class, '*') to avoid matching wildcard
+    # patterns like '*VectorizedDeltaReaderBenchmark' that don't need TPC-DS data.
+    if: contains(inputs.class, 'TPCDSQueryBenchmark') || contains(inputs.class, 'LZ4TPCDSDataBenchmark') || contains(inputs.class, 'ZStandardTPCDSDataBenchmark') || inputs.class == '*'
     runs-on: ubuntu-latest
     env:
       SPARK_LOCAL_IP: localhost
@@ -156,6 +164,19 @@ jobs:
         # In order to get diff files
         with:
           fetch-depth: 0
+      - name: Check CPU model
+        run: |
+          CPU_MODEL=$(grep "model name" /proc/cpuinfo | head -1 | sed 's/model name\s*:\s*//')
+          echo "Runner CPU: $CPU_MODEL"
+          echo "::notice::Runner CPU: $CPU_MODEL"
+          if [ -n "${{ inputs.expected-cpu }}" ]; then
+            if echo "$CPU_MODEL" | grep -qF "${{ inputs.expected-cpu }}"; then
+              echo "CPU matches expected: ${{ inputs.expected-cpu }}"
+            else
+              echo "::error::CPU mismatch! Expected '${{ inputs.expected-cpu }}' but got '$CPU_MODEL'"
+              exit 1
+            fi
+          fi
       - name: Cache SBT and Maven
         uses: actions/cache@v5
         with:
@@ -179,7 +200,7 @@ jobs:
           distribution: zulu
           java-version: ${{ inputs.jdk }}
       - name: Cache TPC-DS generated data
-        if: contains(inputs.class, 'TPCDSQueryBenchmark') || contains(inputs.class, 'LZ4TPCDSDataBenchmark') || contains(inputs.class, 'ZStandardTPCDSDataBenchmark') || contains(inputs.class, '*')
+        if: contains(inputs.class, 'TPCDSQueryBenchmark') || contains(inputs.class, 'LZ4TPCDSDataBenchmark') || contains(inputs.class, 'ZStandardTPCDSDataBenchmark') || inputs.class == '*'
         id: cache-tpcds-sf-1
         uses: actions/cache@v5
         with:

From 1b52da52967335c103899d7d74ea59531ee62cf9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Isma=C3=ABl=20Mej=C3=ADa?=
Date: Mon, 15 Jun 2026 09:18:43 +0200
Subject: [PATCH 2/2] Route expected-cpu input through env variable for shell
 safety

Address review nit: use step-level env block to pass the
inputs.expected-cpu value as a properly-quoted shell variable instead
of template-expanding it directly into the script.

Also pass `--` before the grep pattern so that an expected-cpu value
beginning with '-' is matched as a fixed string rather than being
parsed as a grep option.

Assisted-by: OpenCode:claude-opus-4.6
---
 .github/workflows/benchmark.yml | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 088c2d240983b..40e021d19c88a 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -165,15 +165,17 @@ jobs:
         with:
           fetch-depth: 0
       - name: Check CPU model
+        env:
+          EXPECTED_CPU: ${{ inputs.expected-cpu }}
         run: |
           CPU_MODEL=$(grep "model name" /proc/cpuinfo | head -1 | sed 's/model name\s*:\s*//')
           echo "Runner CPU: $CPU_MODEL"
           echo "::notice::Runner CPU: $CPU_MODEL"
-          if [ -n "${{ inputs.expected-cpu }}" ]; then
-            if echo "$CPU_MODEL" | grep -qF "${{ inputs.expected-cpu }}"; then
-              echo "CPU matches expected: ${{ inputs.expected-cpu }}"
+          if [ -n "$EXPECTED_CPU" ]; then
+            if echo "$CPU_MODEL" | grep -qF -- "$EXPECTED_CPU"; then
+              echo "CPU matches expected: $EXPECTED_CPU"
             else
-              echo "::error::CPU mismatch! Expected '${{ inputs.expected-cpu }}' but got '$CPU_MODEL'"
+              echo "::error::CPU mismatch! Expected '$EXPECTED_CPU' but got '$CPU_MODEL'"
               exit 1
             fi
           fi