parca-dev
diff --git a/‎.gitignore‎
Lines changed: 2 additions & 0 deletions b/‎.gitignore‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎Makefile‎
Lines changed: 14 additions & 29 deletions b/‎Makefile‎
Lines changed: 14 additions & 29 deletions
diff --git a/‎microbenchmarks/pc_sample_toy.cu‎
Lines changed: 110 additions & 0 deletions b/‎microbenchmarks/pc_sample_toy.cu‎
Lines changed: 110 additions & 0 deletions
diff --git a/‎microbenchmarks/rapid_launch.cu‎
Lines changed: 44 additions & 0 deletions b/‎microbenchmarks/rapid_launch.cu‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎parcagpu.bt‎
Lines changed: 10 additions & 29 deletions b/‎parcagpu.bt‎
Lines changed: 10 additions & 29 deletions
@@ -5,3 +5,5 @@ test/bpf/activity_parser
 test/bpf/activityparser_*.go
 test/bpf/activityparser_*.o
 src/probes.h
+microbenchmarks/rapid_launch
+microbenchmarks/pc_sample_toy
@@ -1,4 +1,4 @@
-.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test test-multi test-pc-real
+.PHONY: all clean test build-amd64 build-arm64 build-all cross docker-push docker-test-build docker-test-run format local debug bpf-test microbenchmarks test-multi test-pc-real
 
 LIB_NAME = libparcagpucupti.so
 
@@ -104,6 +104,17 @@ docker-test-run: docker-test-build
 	@echo "=== Running tests in container ==="
 	@docker run --rm parcagpu-test:latest $(ARGS)
 
+# Build microbenchmark CUDA toys (with DWARF debug info for cubin symbolization)
+# NVCC and CUDA_ARCH are assigned with ?= so a value supplied from the
+# environment or the make command line takes precedence
+# (e.g. make microbenchmarks CUDA_ARCH=sm_80).
+NVCC ?= nvcc
+CUDA_ARCH ?= native
+# Every .cu file directly under microbenchmarks/ is treated as a benchmark
+# source; each binary name is the source path with the .cu suffix removed.
+MICROBENCH_SRCS := $(wildcard microbenchmarks/*.cu)
+MICROBENCH_BINS := $(MICROBENCH_SRCS:.cu=)
+
+# Aggregate target: builds every microbenchmark binary.
+microbenchmarks: $(MICROBENCH_BINS)
+
+# Pattern rule: compile one .cu source into a same-named binary next to it.
+microbenchmarks/%: microbenchmarks/%.cu
+	$(NVCC) -g -lineinfo -arch=$(CUDA_ARCH) -o $@ $<
+
 # Build the BPF activity parser test program
 # Requires: clang, libbpf-dev, bpftool (for vmlinux.h), Go 1.21+
 bpf-test:
@@ -137,34 +148,8 @@ test-multi: local bpf-test
 
 # Run pc_sample_toy with BPF activity parser and verify stall reason map is received.
 # Requires: real GPU, root (sudo) for BPF, pc_sample_toy compiled separately.
-test-pc-real: local bpf-test
-	@echo "=== Running PC sampling smoke test ==="
-	@LIB_PATH="$$(pwd)/build-local/lib/libparcagpucupti.so"; \
-	TOY="$$(pwd)/microbenchmarks/pc_sample_toy"; \
-	if [ ! -x "$$TOY" ]; then \
-		echo "error: $$TOY not found — compile with: /usr/local/cuda/bin/nvcc -o microbenchmarks/pc_sample_toy microbenchmarks/pc_sample_toy.cu" >&2; \
-		exit 1; \
-	fi; \
-	PARCAGPU_SAMPLING_FACTOR=18 CUDA_INJECTION64_PATH="$$LIB_PATH" "$$TOY" 3 & \
-	TOY_PID=$$!; \
-	echo "pc_sample_toy PID: $${TOY_PID}"; \
-	while kill -0 $${TOY_PID} 2>/dev/null && ! grep -q libparcagpucupti "/proc/$${TOY_PID}/maps" 2>/dev/null; do \
-		sleep 0.1; \
-	done; \
-	echo "Starting BPF activity parser (requires root)..."; \
-	sudo test/bpf/activity_parser -pid $${TOY_PID} -lib "$$LIB_PATH" -v 2>&1 | tee /tmp/parcagpu-pc-test.log & \
-	BPF_PID=$$!; \
-	wait $${TOY_PID} 2>/dev/null; \
-	sleep 1; \
-	sudo kill $${BPF_PID} 2>/dev/null; wait $${BPF_PID} 2>/dev/null; \
-	echo; \
-	if grep -q 'stall reason map:' /tmp/parcagpu-pc-test.log && \
-	   grep -q 'smsp__pcsamp' /tmp/parcagpu-pc-test.log; then \
-		echo "=== PASS: stall reason map received ==="; \
-	else \
-		echo "=== FAIL: stall reason map not found in output ===" >&2; \
-		exit 1; \
-	fi
+# The microbenchmarks prerequisite builds microbenchmarks/pc_sample_toy, so no
+# separate nvcc step is needed before running this target. The test itself is
+# delegated to test/test-pc-real.sh, run under sudo with -E.
+test-pc-real: local bpf-test microbenchmarks
+	sudo -E test/test-pc-real.sh
 
 format:
 	@echo "=== Formatting source files ==="
 
@@ -0,0 +1,110 @@
+// pc_sample_toy.cu — a simple GPU busy-loop for testing PC sampling
+// Compile: make microbenchmarks  (or: nvcc -g -lineinfo -arch=native -o pc_sample_toy pc_sample_toy.cu)
+// Run:     ./pc_sample_toy
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <unistd.h>
+
+// Evaluate a CUDA runtime call exactly once. If it returns anything other
+// than cudaSuccess, print the call site and the runtime's error string to
+// stderr and terminate the process with exit status 1.
+#define CHECK(call)                                                            \
+  do {                                                                         \
+    const cudaError_t check_status_ = (call);                                  \
+    if (check_status_ == cudaSuccess)                                          \
+      break;                                                                   \
+    fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,           \
+            cudaGetErrorString(check_status_));                                \
+    exit(1);                                                                   \
+  } while (0)
+
+// Kernel A: heavy FP math (sin/cos chain).
+// Each thread whose global index is below n seeds a float from that index,
+// applies x = sinf(x) * cosf(x) + 0.1f `iters` times, and stores the result
+// in out[idx]. Threads at or beyond n do nothing.
+__global__ void trig_storm(float *out, int n, unsigned long long iters) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    float acc = (float)idx * 0.001f;
+    unsigned long long remaining = iters;
+    while (remaining-- > 0) {
+      acc = sinf(acc) * cosf(acc) + 0.1f;
+    }
+    out[idx] = acc;
+  }
+}
+
+// Kernel B: integer bit-twiddling.
+// Each thread whose global index is below n starts from idx ^ 0xdeadbeef,
+// runs `iters` rounds of shift/xor mixing plus the round counter, and stores
+// the final word in out[idx]. Threads at or beyond n do nothing.
+__global__ void hash_churn(unsigned int *out, int n, unsigned long long iters) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < n) {
+    unsigned int state = (unsigned int)idx ^ 0xdeadbeefu;
+    for (unsigned long long step = 0; step != iters; ++step) {
+      state ^= state << 13;
+      state ^= state >> 17;
+      state ^= state << 5;
+      state += (unsigned int)step;
+    }
+    out[idx] = state;
+  }
+}
+
+// Kernel C: shared-memory bouncing.
+// Each thread owns one slot of a 256-float shared tile, seeded with its
+// global index. For `iters` rounds it adds 1% of its right-hand neighbour's
+// slot (wrapping at blockDim.x) to its own slot. Threads whose global index
+// is below n then store their slot in out[idx].
+//
+// Launch requirements: 1-D blocks with blockDim.x <= 256 (the tile size).
+// All threads of a block, including those with idx >= n, execute every
+// barrier, which is why only the final store is bounds-checked.
+__global__ void shmem_bounce(float *out, int n, unsigned long long iters) {
+  __shared__ float tile[256];
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  int tid = threadIdx.x;
+  int next = (tid + 1) % blockDim.x; // neighbour slot, loop-invariant
+
+  tile[tid] = (float)idx;
+  __syncthreads();
+
+  for (unsigned long long i = 0; i < iters; i++) {
+    // Read the neighbour into a register and wait for every thread to finish
+    // reading before any slot is overwritten. A combined read-modify-write
+    // with a single barrier races with the neighbour's own update.
+    float neighbour = tile[next];
+    __syncthreads();
+    tile[tid] += neighbour * 0.01f;
+    __syncthreads();
+  }
+
+  if (idx < n)
+    out[idx] = tile[tid];
+}
+
+// Run one pass of the three toy kernels, one after another.
+//
+// Allocates a float buffer and an unsigned-int buffer of N elements on the
+// device, prints the PID and pauses one second so a profiler can attach, then
+// launches trig_storm, hash_churn and shmem_bounce in turn, waiting for each
+// to finish before starting the next. Both buffers are freed at the end.
+// Every CUDA call goes through CHECK, so any failure exits the process.
+void go() {
+  const int N = 1 << 18; // 256K elements
+  const int threads = 256; // shmem_bounce's shared tile holds 256 floats
+  const int blocks = (N + threads - 1) / threads;
+
+  float *d_float;
+  unsigned int *d_uint;
+
+  CHECK(cudaMalloc(&d_float, N * sizeof(float)));
+  CHECK(cudaMalloc(&d_uint, N * sizeof(unsigned int)));
+
+  printf("Launching GPU kernels — attach your profiler now.\n");
+  printf("PID: %d\n\n", getpid());
+  // stdout is block-buffered when redirected to a pipe or file; flush so the
+  // PID is visible before the pause rather than only at exit.
+  fflush(stdout);
+
+  sleep(1);
+  // Each kernel runs for roughly 0.5–1 second depending on GPU.
+  // Tune the iteration count up/down as needed.
+
+  printf("  [1/3] trig_storm ...\n");
+  trig_storm<<<blocks, threads>>>(d_float, N, 500000ULL);
+  // Launch failures (bad configuration, no kernel image for this GPU) are
+  // reported through cudaGetLastError and may not be returned by the
+  // synchronize that follows.
+  CHECK(cudaGetLastError());
+  CHECK(cudaDeviceSynchronize());
+
+  printf("  [2/3] hash_churn ...\n");
+  hash_churn<<<blocks, threads>>>(d_uint, N, 2000000ULL);
+  CHECK(cudaGetLastError());
+  CHECK(cudaDeviceSynchronize());
+
+  printf("  [3/3] shmem_bounce ...\n");
+  shmem_bounce<<<blocks, threads>>>(d_float, N, 50000ULL);
+  CHECK(cudaGetLastError());
+  CHECK(cudaDeviceSynchronize());
+
+  printf("\nDone.\n");
+
+  CHECK(cudaFree(d_float));
+  CHECK(cudaFree(d_uint));
+}
+
+// Entry point: calls go() repeatedly. The repeat count is atoi(argv[1]) when
+// an argument is supplied and 1 otherwise; a count of zero or less runs
+// nothing.
+int main(int argc, char **argv) {
+  int remaining = (argc > 1) ? atoi(argv[1]) : 1;
+  for (; remaining > 0; --remaining) {
+    go();
+  }
+}
@@ -0,0 +1,44 @@
+// rapid_launch.cu — measures per-kernel-launch overhead from CUPTI injection.
+// Launches many tiny kernels to stress the callback path.
+//
+// Compile: make microbenchmarks  (or: nvcc -g -lineinfo -arch=native -o rapid_launch rapid_launch.cu)
+// Run:     ./rapid_launch [num_launches]
+//
+// Compare:
+//   ./rapid_launch 50000                                          # baseline
+//   CUDA_INJECTION64_PATH=.../libparcagpucupti.so ./rapid_launch 50000  # injected
+
+#include <cuda_runtime.h>
+#include <stdio.h>
+#include <time.h>
+
+// Kernel with an empty body; main launches it with a single thread so that
+// each iteration measures only the launch-and-synchronize path.
+__global__ void empty_kernel() {}
+
+// Current CLOCK_MONOTONIC reading, expressed as fractional seconds.
+static double now_sec() {
+  struct timespec now;
+  clock_gettime(CLOCK_MONOTONIC, &now);
+  const double whole = now.tv_sec;
+  const double frac = now.tv_nsec * 1e-9;
+  return whole + frac;
+}
+
+// Time n launch-and-synchronize round trips of empty_kernel.
+//
+// n is atoi(argv[1]) when an argument is given and 50000 otherwise; it must
+// be positive. Prints the total wall time and the mean microseconds per
+// launch. Returns 0 on success, 1 on a bad argument or any CUDA error.
+int main(int argc, char **argv) {
+  int n = 50000;
+  if (argc > 1)
+    n = atoi(argv[1]);
+  if (n <= 0) {
+    // Also covers non-numeric input, for which atoi yields 0. Without this
+    // guard the per-launch average below would divide by zero.
+    fprintf(stderr, "usage: %s [num_launches]  (must be a positive integer)\n",
+            argv[0]);
+    return 1;
+  }
+
+  // Warm up the CUDA context and any injection library init.
+  empty_kernel<<<1, 1>>>();
+  cudaError_t err = cudaGetLastError();
+  if (err == cudaSuccess)
+    err = cudaDeviceSynchronize();
+  if (err != cudaSuccess) {
+    fprintf(stderr, "CUDA error during warm-up: %s\n",
+            cudaGetErrorString(err));
+    return 1;
+  }
+
+  // Synchronous launches — each one round-trips through CUPTI callbacks.
+  double t0 = now_sec();
+  for (int i = 0; i < n; i++) {
+    empty_kernel<<<1, 1>>>();
+    // Inspecting the return value adds no CUDA API call to the timed path.
+    err = cudaDeviceSynchronize();
+    if (err != cudaSuccess)
+      break;
+  }
+  double t1 = now_sec();
+
+  // Launch errors are collected once, outside the timed region, so the loop
+  // issues exactly the same CUDA calls as an unchecked run would.
+  if (err == cudaSuccess)
+    err = cudaGetLastError();
+  if (err != cudaSuccess) {
+    fprintf(stderr, "CUDA error during timed launches: %s\n",
+            cudaGetErrorString(err));
+    return 1;
+  }
+
+  double elapsed = t1 - t0;
+  printf("%d launches in %.3f s  (%.1f us/launch)\n", n, elapsed,
+         elapsed / n * 1e6);
+  return 0;
+}
@@ -38,36 +38,15 @@ usdt:*:parcagpu:cuda_correlation {
     printf("%-12s.%-6u [CORR] %u: cbid=%u %s\n", strftime("%H:%M:%S", nsecs), (nsecs % 1000000000) / 1000, $correlation_id, $cbid, $name);
 }
 
-usdt:*:parcagpu:pc_sample_summary {
-    $function_index = arg0;
-    $pc_offset = arg1;
-    $total_samples = arg2;
-    $stalled_samples = arg3;
-    $function_name = str(arg4);
-
-    printf("%-12s.%-6u [PC_SAMPLE] func=%u pc=0x%lx total=%lu stalled=%lu %s\n",
-           strftime("%H:%M:%S", nsecs),
-           (nsecs % 1000000000) / 1000,
-           $function_index,
-           $pc_offset,
-           $total_samples,
-           $stalled_samples,
-           $function_name);
-}
-
-usdt:*:parcagpu:pc_stall_reason {
-    $function_index = arg0;
-    $pc_offset = arg1;
-    $stall_reason_index = arg2;
-    $samples = arg3;
+usdt:*:parcagpu:pc_sample_batch {
+    $records = arg0;
+    $count = arg1;
 
-    printf("%-12s.%-6u [STALL]     func=%u pc=0x%lx reason[%u] samples=%lu\n",
+    printf("%-12s.%-6u [PC_BATCH] count=%u records=%p\n",
            strftime("%H:%M:%S", nsecs),
            (nsecs % 1000000000) / 1000,
-           $function_index,
-           $pc_offset,
-           $stall_reason_index,
-           $samples);
+           $count,
+           $records);
 }
 
 usdt:*:parcagpu:stall_reason_map {
@@ -83,11 +62,13 @@ usdt:*:parcagpu:stall_reason_map {
 
 usdt:*:parcagpu:cubin_loaded {
     $cubin_crc = arg0;
+    $cubin_size = arg2;
 
-    printf("%-12s.%-6u [CUBIN_LOAD] crc=0x%lx\n",
+    printf("%-12s.%-6u [CUBIN_LOAD] crc=0x%lx size=%lu\n",
            strftime("%H:%M:%S", nsecs),
            (nsecs % 1000000000) / 1000,
-           $cubin_crc);
+           $cubin_crc,
+           $cubin_size);
 }
 
 usdt:*:parcagpu:cubin_unloaded {