|
// pc_sample_toy.cu — a simple GPU busy-loop for testing PC sampling
// Compile: make microbenchmarks (or: nvcc -g -lineinfo -arch=native -o pc_sample_toy pc_sample_toy.cu)
// Run: ./pc_sample_toy
| 4 | + |
#include <cuda_runtime.h>

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
| 8 | + |
// Abort the process with a file:line diagnostic when a CUDA runtime call
// returns anything other than cudaSuccess.
#define CHECK(call)                                                     \
  do {                                                                  \
    const cudaError_t status_ = (call);                                 \
    if (status_ != cudaSuccess) {                                       \
      fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
              cudaGetErrorString(status_));                             \
      exit(1);                                                          \
    }                                                                   \
  } while (0)
| 18 | + |
// Kernel A: heavy FP math (sin/cos chain).
// Expects a 1-D launch; each thread iterates `iters` times on a value
// seeded from its global index and stores the result to out[gid].
__global__ void trig_storm(float *out, int n, unsigned long long iters) {
  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid < n) {
    float v = 0.001f * (float)gid;
    for (unsigned long long step = 0; step < iters; ++step) {
      v = sinf(v) * cosf(v) + 0.1f;
    }
    out[gid] = v;
  }
}
| 31 | + |
// Kernel B: integer bit-twiddling (xorshift-style mixing).
// Expects a 1-D launch; each thread churns a per-thread hash state for
// `iters` rounds and stores the result to out[gid].
__global__ void hash_churn(unsigned int *out, int n, unsigned long long iters) {
  const int gid = blockIdx.x * blockDim.x + threadIdx.x;
  if (gid >= n)
    return;

  unsigned int state = (unsigned int)gid ^ 0xdeadbeef;
  for (unsigned long long step = 0; step < iters; ++step) {
    state ^= state << 13;
    state ^= state >> 17;
    state ^= state << 5;
    state += (unsigned int)step;
  }
  out[gid] = state;
}
| 47 | + |
// Kernel C: shared-memory bouncing.
// Each thread repeatedly folds its ring-neighbor's tile value into its own.
// Precondition: blockDim.x <= 256 (static tile size); the launch in go()
// uses exactly 256.
//
// Fix: the original read tile[(tid+1) % blockDim.x] while the neighboring
// thread was concurrently read-modify-writing that same element in the same
// iteration — a shared-memory data race (racecheck-visible). Read into a
// register first, barrier, then write, then barrier before the next read.
__global__ void shmem_bounce(float *out, int n, unsigned long long iters) {
  __shared__ float tile[256];
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  int tid = threadIdx.x;

  tile[tid] = (float)idx;
  __syncthreads();

  // All threads in the block execute the full loop (the idx < n guard is
  // only at the end), so the barriers below are never divergent.
  for (unsigned long long i = 0; i < iters; i++) {
    float neighbor = tile[(tid + 1) % blockDim.x];
    __syncthreads(); // all reads complete before any write
    tile[tid] += neighbor * 0.01f;
    __syncthreads(); // all writes complete before next iteration's reads
  }

  if (idx < n)
    out[idx] = tile[tid];
}
| 65 | + |
// Allocate device buffers, run the three sample kernels back-to-back (with a
// 1 s window to attach a profiler), then free the buffers.
// Note: `threads` must stay 256 — shmem_bounce's static shared tile is sized
// for exactly that block width.
//
// Fix: each launch is now followed by CHECK(cudaGetLastError()) so that
// launch-configuration errors are reported at the launch site instead of
// being conflated with async execution errors at the synchronize.
void go() {
  const int N = 1 << 18; // 256K elements
  const int threads = 256;
  const int blocks = (N + threads - 1) / threads; // ceil-div grid

  float *d_float;
  unsigned int *d_uint;

  CHECK(cudaMalloc(&d_float, N * sizeof(float)));
  CHECK(cudaMalloc(&d_uint, N * sizeof(unsigned int)));

  printf("Launching GPU kernels — attach your profiler now.\n");
  printf("PID: %d\n\n", getpid());

  sleep(1);
  // Each kernel runs for roughly 0.5–1 second depending on GPU.
  // Tune the iteration count up/down as needed.

  printf(" [1/3] trig_storm ...\n");
  trig_storm<<<blocks, threads>>>(d_float, N, 500000ULL);
  CHECK(cudaGetLastError());      // launch-config errors
  CHECK(cudaDeviceSynchronize()); // async execution errors

  printf(" [2/3] hash_churn ...\n");
  hash_churn<<<blocks, threads>>>(d_uint, N, 2000000ULL);
  CHECK(cudaGetLastError());
  CHECK(cudaDeviceSynchronize());

  printf(" [3/3] shmem_bounce ...\n");
  shmem_bounce<<<blocks, threads>>>(d_float, N, 50000ULL);
  CHECK(cudaGetLastError());
  CHECK(cudaDeviceSynchronize());

  printf("\nDone.\n");

  CHECK(cudaFree(d_float));
  CHECK(cudaFree(d_uint));
}
| 101 | + |
// Entry point. Optional argv[1] gives the number of times to repeat the
// whole kernel sequence (default 1; non-numeric input parses to 0 runs,
// matching atoi semantics).
int main(int argc, char **argv) {
  int remaining = (argc > 1) ? atoi(argv[1]) : 1;
  for (; remaining > 0; --remaining) {
    go();
  }
  return 0;
}
0 commit comments