Skip to content

Commit 6496aa7

Browse files
committed
Flush out cubin processing, sass lookup and pc sampling probe batching
1 parent f87ed73 commit 6496aa7

File tree

15 files changed

+1069
-206
lines changed

15 files changed

+1069
-206
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,3 +5,5 @@ test/bpf/activity_parser
55
test/bpf/activityparser_*.go
66
test/bpf/activityparser_*.o
77
src/probes.h
8+
microbenchmarks/rapid_launch
9+
microbenchmarks/pc_sample_toy

Makefile

Lines changed: 14 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test test-multi test-pc-real
1+
.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test microbenchmarks test-multi test-pc-real
22

33
LIB_NAME = libparcagpucupti.so
44

@@ -104,6 +104,17 @@ docker-test-run: docker-test-build
104104
@echo "=== Running tests in container ==="
105105
@docker run --rm parcagpu-test:latest $(ARGS)
106106

107+
# Build microbenchmark CUDA toys (with DWARF debug info for cubin symbolization)
108+
NVCC ?= nvcc
109+
CUDA_ARCH ?= native
110+
MICROBENCH_SRCS := $(wildcard microbenchmarks/*.cu)
111+
MICROBENCH_BINS := $(MICROBENCH_SRCS:.cu=)
112+
113+
microbenchmarks: $(MICROBENCH_BINS)
114+
115+
microbenchmarks/%: microbenchmarks/%.cu
116+
$(NVCC) -g -lineinfo -arch=$(CUDA_ARCH) -o $@ $<
117+
107118
# Build the BPF activity parser test program
108119
# Requires: clang, libbpf-dev, bpftool (for vmlinux.h), Go 1.21+
109120
bpf-test:
@@ -137,34 +148,8 @@ test-multi: local bpf-test
137148

138149
# Run pc_sample_toy with BPF activity parser and verify stall reason map is received.
139150
# Requires: real GPU, root (sudo) for BPF, pc_sample_toy compiled separately.
140-
test-pc-real: local bpf-test
141-
@echo "=== Running PC sampling smoke test ==="
142-
@LIB_PATH="$$(pwd)/build-local/lib/libparcagpucupti.so"; \
143-
TOY="$$(pwd)/microbenchmarks/pc_sample_toy"; \
144-
if [ ! -x "$$TOY" ]; then \
145-
echo "error: $$TOY not found — compile with: /usr/local/cuda/bin/nvcc -o microbenchmarks/pc_sample_toy microbenchmarks/pc_sample_toy.cu" >&2; \
146-
exit 1; \
147-
fi; \
148-
PARCAGPU_SAMPLING_FACTOR=18 CUDA_INJECTION64_PATH="$$LIB_PATH" "$$TOY" 3 & \
149-
TOY_PID=$$!; \
150-
echo "pc_sample_toy PID: $${TOY_PID}"; \
151-
while kill -0 $${TOY_PID} 2>/dev/null && ! grep -q libparcagpucupti "/proc/$${TOY_PID}/maps" 2>/dev/null; do \
152-
sleep 0.1; \
153-
done; \
154-
echo "Starting BPF activity parser (requires root)..."; \
155-
sudo test/bpf/activity_parser -pid $${TOY_PID} -lib "$$LIB_PATH" -v 2>&1 | tee /tmp/parcagpu-pc-test.log & \
156-
BPF_PID=$$!; \
157-
wait $${TOY_PID} 2>/dev/null; \
158-
sleep 1; \
159-
sudo kill $${BPF_PID} 2>/dev/null; wait $${BPF_PID} 2>/dev/null; \
160-
echo; \
161-
if grep -q 'stall reason map:' /tmp/parcagpu-pc-test.log && \
162-
grep -q 'smsp__pcsamp' /tmp/parcagpu-pc-test.log; then \
163-
echo "=== PASS: stall reason map received ==="; \
164-
else \
165-
echo "=== FAIL: stall reason map not found in output ===" >&2; \
166-
exit 1; \
167-
fi
151+
test-pc-real: local bpf-test microbenchmarks
152+
sudo -E test/test-pc-real.sh
168153

169154
format:
170155
@echo "=== Formatting source files ==="

microbenchmarks/pc_sample_toy.cu

Lines changed: 110 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,110 @@
1+
// pc_sample_toy.cu — a simple GPU busy-loop for testing PC sampling
2+
// Compile: make microbenchmarks (or: nvcc -g -lineinfo -arch=native -o pc_sample_toy pc_sample_toy.cu)
3+
// Run: ./pc_sample_toy
4+
5+
#include <cuda_runtime.h>
6+
#include <stdio.h>
7+
#include <unistd.h>
8+
9+
#define CHECK(call) \
10+
do { \
11+
cudaError_t err = (call); \
12+
if (err != cudaSuccess) { \
13+
fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
14+
cudaGetErrorString(err)); \
15+
exit(1); \
16+
} \
17+
} while (0)
18+
19+
// Kernel A: heavy FP math (sin/cos chain)
20+
__global__ void trig_storm(float *out, int n, unsigned long long iters) {
21+
int idx = blockIdx.x * blockDim.x + threadIdx.x;
22+
if (idx >= n)
23+
return;
24+
25+
float x = (float)idx * 0.001f;
26+
for (unsigned long long i = 0; i < iters; i++) {
27+
x = sinf(x) * cosf(x) + 0.1f;
28+
}
29+
out[idx] = x;
30+
}
31+
32+
// Kernel B: integer bit-twiddling
33+
__global__ void hash_churn(unsigned int *out, int n, unsigned long long iters) {
34+
int idx = blockIdx.x * blockDim.x + threadIdx.x;
35+
if (idx >= n)
36+
return;
37+
38+
unsigned int h = idx ^ 0xdeadbeef;
39+
for (unsigned long long i = 0; i < iters; i++) {
40+
h ^= h << 13;
41+
h ^= h >> 17;
42+
h ^= h << 5;
43+
h += (unsigned int)i;
44+
}
45+
out[idx] = h;
46+
}
47+
48+
// Kernel C: shared-memory bouncing
49+
__global__ void shmem_bounce(float *out, int n, unsigned long long iters) {
50+
__shared__ float tile[256];
51+
int idx = blockIdx.x * blockDim.x + threadIdx.x;
52+
int tid = threadIdx.x;
53+
54+
tile[tid] = (float)idx;
55+
__syncthreads();
56+
57+
for (unsigned long long i = 0; i < iters; i++) {
58+
tile[tid] += tile[(tid + 1) % blockDim.x] * 0.01f;
59+
__syncthreads();
60+
}
61+
62+
if (idx < n)
63+
out[idx] = tile[tid];
64+
}
65+
66+
void go() {
67+
const int N = 1 << 18; // 256K elements
68+
const int threads = 256;
69+
const int blocks = (N + threads - 1) / threads;
70+
71+
float *d_float;
72+
unsigned int *d_uint;
73+
74+
CHECK(cudaMalloc(&d_float, N * sizeof(float)));
75+
CHECK(cudaMalloc(&d_uint, N * sizeof(unsigned int)));
76+
77+
printf("Launching GPU kernels — attach your profiler now.\n");
78+
printf("PID: %d\n\n", getpid());
79+
80+
sleep(1);
81+
// Each kernel runs for roughly 0.5–1 second depending on GPU.
82+
// Tune the iteration count up/down as needed.
83+
84+
printf(" [1/3] trig_storm ...\n");
85+
trig_storm<<<blocks, threads>>>(d_float, N, 500000ULL);
86+
CHECK(cudaDeviceSynchronize());
87+
88+
printf(" [2/3] hash_churn ...\n");
89+
hash_churn<<<blocks, threads>>>(d_uint, N, 2000000ULL);
90+
CHECK(cudaDeviceSynchronize());
91+
92+
printf(" [3/3] shmem_bounce ...\n");
93+
shmem_bounce<<<blocks, threads>>>(d_float, N, 50000ULL);
94+
CHECK(cudaDeviceSynchronize());
95+
96+
printf("\nDone.\n");
97+
98+
CHECK(cudaFree(d_float));
99+
CHECK(cudaFree(d_uint));
100+
}
101+
102+
int main(int argc, char **argv) {
103+
int loops = 1;
104+
if (argc > 1) {
105+
loops = atoi(argv[1]);
106+
}
107+
while (loops-- > 0) {
108+
go();
109+
}
110+
}

microbenchmarks/rapid_launch.cu

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
// rapid_launch.cu — measures per-kernel-launch overhead from CUPTI injection.
2+
// Launches many tiny kernels to stress the callback path.
3+
//
4+
// Compile: nvcc -o rapid_launch rapid_launch.cu
5+
// Run: ./rapid_launch [num_launches]
6+
//
7+
// Compare:
8+
// ./rapid_launch 50000 # baseline
9+
// CUDA_INJECTION64_PATH=.../libparcagpucupti.so ./rapid_launch 50000 # injected
10+
11+
#include <cuda_runtime.h>
12+
#include <stdio.h>
13+
#include <time.h>
14+
15+
__global__ void empty_kernel() {}
16+
17+
static double now_sec() {
18+
struct timespec ts;
19+
clock_gettime(CLOCK_MONOTONIC, &ts);
20+
return ts.tv_sec + ts.tv_nsec * 1e-9;
21+
}
22+
23+
int main(int argc, char **argv) {
24+
int n = 50000;
25+
if (argc > 1)
26+
n = atoi(argv[1]);
27+
28+
// Warm up the CUDA context and any injection library init.
29+
empty_kernel<<<1, 1>>>();
30+
cudaDeviceSynchronize();
31+
32+
// Synchronous launches — each one round-trips through CUPTI callbacks.
33+
double t0 = now_sec();
34+
for (int i = 0; i < n; i++) {
35+
empty_kernel<<<1, 1>>>();
36+
cudaDeviceSynchronize();
37+
}
38+
double t1 = now_sec();
39+
40+
double elapsed = t1 - t0;
41+
printf("%d launches in %.3f s (%.1f us/launch)\n", n, elapsed,
42+
elapsed / n * 1e6);
43+
return 0;
44+
}

parcagpu.bt

Lines changed: 10 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -38,36 +38,15 @@ usdt:*:parcagpu:cuda_correlation {
3838
printf("%-12s.%-6u [CORR] %u: cbid=%u %s\n", strftime("%H:%M:%S", nsecs), (nsecs % 1000000000) / 1000, $correlation_id, $cbid, $name);
3939
}
4040

41-
usdt:*:parcagpu:pc_sample_summary {
42-
$function_index = arg0;
43-
$pc_offset = arg1;
44-
$total_samples = arg2;
45-
$stalled_samples = arg3;
46-
$function_name = str(arg4);
47-
48-
printf("%-12s.%-6u [PC_SAMPLE] func=%u pc=0x%lx total=%lu stalled=%lu %s\n",
49-
strftime("%H:%M:%S", nsecs),
50-
(nsecs % 1000000000) / 1000,
51-
$function_index,
52-
$pc_offset,
53-
$total_samples,
54-
$stalled_samples,
55-
$function_name);
56-
}
57-
58-
usdt:*:parcagpu:pc_stall_reason {
59-
$function_index = arg0;
60-
$pc_offset = arg1;
61-
$stall_reason_index = arg2;
62-
$samples = arg3;
41+
usdt:*:parcagpu:pc_sample_batch {
42+
$records = arg0;
43+
$count = arg1;
6344

64-
printf("%-12s.%-6u [STALL] func=%u pc=0x%lx reason[%u] samples=%lu\n",
45+
printf("%-12s.%-6u [PC_BATCH] count=%u records=%p\n",
6546
strftime("%H:%M:%S", nsecs),
6647
(nsecs % 1000000000) / 1000,
67-
$function_index,
68-
$pc_offset,
69-
$stall_reason_index,
70-
$samples);
48+
$count,
49+
$records);
7150
}
7251

7352
usdt:*:parcagpu:stall_reason_map {
@@ -83,11 +62,13 @@ usdt:*:parcagpu:stall_reason_map {
8362

8463
usdt:*:parcagpu:cubin_loaded {
8564
$cubin_crc = arg0;
65+
$cubin_size = arg2;
8666

87-
printf("%-12s.%-6u [CUBIN_LOAD] crc=0x%lx\n",
67+
printf("%-12s.%-6u [CUBIN_LOAD] crc=0x%lx size=%lu\n",
8868
strftime("%H:%M:%S", nsecs),
8969
(nsecs % 1000000000) / 1000,
90-
$cubin_crc);
70+
$cubin_crc,
71+
$cubin_size);
9172
}
9273

9374
usdt:*:parcagpu:cubin_unloaded {

0 commit comments

Comments
 (0)