From 786a62a34e58d409508e576042c9fd917204b8fc Mon Sep 17 00:00:00 2001 From: Paulius Velesko Date: Mon, 9 Mar 2026 18:19:34 +0200 Subject: [PATCH] Add reproducer test for IGC miscompilation of __align__(64) struct byval args (#1180) Minimal test that demonstrates IGC codegen bug: when __align__(64) structs are passed by value to a kernel and their fields are loaded into double4 vectors via make_double4() and used in arithmetic with other byval double4 parameters, fields s4-s7 incorrectly read the values of s0-s3. This is an upstream IGC bug (chipStar generates correct LLVM IR with byval align 64). --- tests/runtime/CMakeLists.txt | 2 + tests/runtime/TestHeCBenchFluidSim.hip | 106 +++++++++++++++++++++++++ 2 files changed, 108 insertions(+) create mode 100644 tests/runtime/TestHeCBenchFluidSim.hip diff --git a/tests/runtime/CMakeLists.txt b/tests/runtime/CMakeLists.txt index b24c95e30..96e36130b 100644 --- a/tests/runtime/CMakeLists.txt +++ b/tests/runtime/CMakeLists.txt @@ -193,3 +193,5 @@ add_test(NAME TestDefaultBackend_NoBE ${CMAKE_CURRENT_BINARY_DIR}/TestDefaultBackend) set_tests_properties(TestDefaultBackend_NoBE PROPERTIES PASS_REGULAR_EXPRESSION "PASS") + +add_hip_runtime_test(TestHeCBenchFluidSim.hip) diff --git a/tests/runtime/TestHeCBenchFluidSim.hip b/tests/runtime/TestHeCBenchFluidSim.hip new file mode 100644 index 000000000..c051f6955 --- /dev/null +++ b/tests/runtime/TestHeCBenchFluidSim.hip @@ -0,0 +1,106 @@ +/// \file TestHeCBenchFluidSim.hip +/// Minimal reproducer for chipStar issue #1180: +/// HeCBench fluidSim benchmark fails due to IGC miscompilation +/// of __align__(64) struct fields when used in complex expressions +/// with make_double4 and arithmetic involving other byval parameters. +/// +/// The bug: when an __align__(64) struct passed by value to a kernel has +/// its fields loaded into double4 vectors AND those vectors are used in +/// arithmetic with other byval double4 parameters, the IGC produces +/// incorrect code that reads the wrong struct field values (s4-s7 get +/// the values of s0-s3). + +#include +#include +#include + +typedef struct __align__(64) { + double s0, s1, s2, s3, s4, s5, s6, s7; +} double8; + +// Reproduces the fluidSim LBM kernel pattern: +// - Two double8 structs passed by value +// - A pointer arg and scalar args +// - double4 temporaries built from struct fields via make_double4 +// - Arithmetic combining the double4 temps with other byval double4 args +__global__ void lbmKernel( + double *out, + const double8 dirX, + const double8 dirY, + const double *__restrict__ weight, + double omega, + double f0, double4 f1234, double4 f5678) { + // Build double4 vectors from struct fields (triggers the IGC bug) + double4 temp = make_double4(f1234.x + f5678.x, f1234.y + f5678.y, + f1234.z + f5678.z, f1234.w + f5678.w); + double rho = f0 + temp.x + temp.y + temp.z + temp.w; + + double4 x1234 = make_double4(dirX.s0, dirX.s1, dirX.s2, dirX.s3); + double4 x5678 = make_double4(dirX.s4, dirX.s5, dirX.s6, dirX.s7); + double4 y1234 = make_double4(dirY.s0, dirY.s1, dirY.s2, dirY.s3); + double4 y5678 = make_double4(dirY.s4, dirY.s5, dirY.s6, dirY.s7); + + // Use the double4 vectors in arithmetic (required to trigger the bug) + double ux = ((f1234.x * x1234.x + f1234.y * x1234.y) + + (f1234.z * x1234.z + f1234.w * x1234.w) + + (f5678.x * x5678.x + f5678.y * x5678.y) + + (f5678.z * x5678.z + f5678.w * x5678.w)) / rho; + double uy = ((f1234.x * y1234.x + f1234.y * y1234.y) + + (f1234.z * y1234.z + f1234.w * y1234.w) + + (f5678.x * y5678.x + f5678.y * y5678.y) + + (f5678.z * y5678.z + f5678.w * y5678.w)) / rho; + + out[0] = rho; + out[1] = ux; + out[2] = uy; + // Output the struct field values the kernel actually sees + out[3] = dirX.s0; out[4] = dirX.s1; out[5] = dirX.s2; out[6] = dirX.s3; + out[7] = dirX.s4; out[8] = dirX.s5; out[9] = dirX.s6; out[10] = dirX.s7; + out[11] = dirY.s0; out[12] = dirY.s1; out[13] = dirY.s2; out[14] = dirY.s3; + out[15] = dirY.s4; out[16] = dirY.s5; out[17] = dirY.s6; out[18] = dirY.s7; +} + +int main() { + // Direction vectors from the fluidSim benchmark (LBM D2Q9) + double8 dirX, dirY; + dirX.s0 = 1.0; dirX.s1 = 0.0; dirX.s2 = -1.0; dirX.s3 = 0.0; + dirX.s4 = 1.0; dirX.s5 = -1.0; dirX.s6 = -1.0; dirX.s7 = 1.0; + dirY.s0 = 0.0; dirY.s1 = 1.0; dirY.s2 = 0.0; dirY.s3 = -1.0; + dirY.s4 = 1.0; dirY.s5 = 1.0; dirY.s6 = -1.0; dirY.s7 = -1.0; + + double *d_out, h_out[20]; + double *d_weight; + hipMalloc(&d_out, 20 * sizeof(double)); + hipMalloc(&d_weight, 9 * sizeof(double)); + + double w[9] = {4.0/9, 1.0/9, 1.0/9, 1.0/9, 1.0/9, + 1.0/36, 1.0/36, 1.0/36, 1.0/36}; + hipMemcpy(d_weight, w, 9 * sizeof(double), hipMemcpyHostToDevice); + + double4 f1234 = make_double4(1.0, 1.0, 1.0, 1.0); + double4 f5678 = make_double4(0.5, 0.5, 0.5, 0.5); + + lbmKernel<<<1, 1>>>(d_out, dirX, dirY, d_weight, 1.2, 5.0, f1234, f5678); + hipMemcpy(h_out, d_out, 20 * sizeof(double), hipMemcpyDeviceToHost); + + double expX[] = {1.0, 0.0, -1.0, 0.0, 1.0, -1.0, -1.0, 1.0}; + double expY[] = {0.0, 1.0, 0.0, -1.0, 1.0, 1.0, -1.0, -1.0}; + + bool pass = true; + for (int i = 0; i < 8; i++) { + if (h_out[3 + i] != expX[i]) { + printf("dirX.s%d = %g, expected %g\n", i, h_out[3 + i], expX[i]); + pass = false; + } + if (h_out[11 + i] != expY[i]) { + printf("dirY.s%d = %g, expected %g\n", i, h_out[11 + i], expY[i]); + pass = false; + } + } + + printf("%s\n", pass ? "PASS" : "FAIL"); + + hipFree(d_out); + hipFree(d_weight); + return pass ? 0 : 1; +}