Skip to content

Commit 5738825

Browse files
committed
generate_input receives a random seed argument
1 parent 598ea6d commit 5738825

File tree

5 files changed

+50
-39
lines changed

5 files changed

+50
-39
lines changed

examples/identity_cuda/reference.cuh

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
#include <cstdlib>
77
#include <cmath>
88
#include <array>
9+
#include <random>
910
#include <iostream>
1011

1112
#define N_SIZES 10
@@ -15,13 +16,16 @@ const int Ns[N_SIZES] = {128, 256, 512, 1024, 2048,
1516
using input_t = std::array<std::vector<float>, N_SIZES>;
1617
using output_t = input_t;
1718

18-
input_t generate_input() {
19+
input_t generate_input(int seed) {
20+
std::mt19937 rng(seed);
1921
input_t data;
2022

23+
std::uniform_real_distribution<float> dist(0, 1);
24+
2125
for (int i = 0; i < N_SIZES; ++i) {
2226
data[i].resize(Ns[i]);
2327
for (int j = 0; j < Ns[i]; ++j) {
24-
data[i][j] = static_cast<float>(rand()) / RAND_MAX;
28+
data[i][j] = dist(rng);
2529
}
2630
}
2731

examples/identity_py/reference.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ def ref_kernel(xs: List[torch.Tensor]) -> List[torch.Tensor]:
1616
return xs
1717

1818

19-
def generate_input() -> List[torch.Tensor]:
19+
def generate_input(seed: int) -> List[torch.Tensor]:
2020
"""
2121
Generates random input tensor of the specified shape.
2222
Returns:
@@ -34,8 +34,10 @@ def generate_input() -> List[torch.Tensor]:
3434
device = torch.device("cpu")
3535

3636
tensors = []
37+
rng = torch.Generator(device=device)
38+
rng.manual_seed(seed)
3739
for shape in shapes:
38-
tensors.append(torch.randn(shape, device=device))
40+
tensors.append(torch.randn(shape, device=device, generator=rng))
3941

4042
return tensors
4143

src/discord-cluster-manager/eval.cu

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -52,21 +52,24 @@ static void cuda_check(cudaError_t status, const char* expr, const char* file, i
5252

5353
#define cuda_check(expr) cuda_check(expr, #expr, __FILE__, __LINE__, __FUNCTION__)
5454

55-
void measure_runtime(PopcornOutput& logger) {
55+
void measure_runtime(PopcornOutput& logger, std::mt19937& rng) {
5656
std::cout << "warming up..." << std::endl;
5757

58-
for (int i = 0; i < WARMUP_RUNS; i++) {
59-
auto data = generate_input();
60-
// discard result; this is just warmup, we don't care what it returns
61-
(void)custom_kernel(data);
58+
{
59+
auto warmup_data = generate_input(rng());
60+
for (int i = 0; i < WARMUP_RUNS; i++) {
61+
// discard result; this is just warmup, we don't care what it returns
62+
(void)custom_kernel(warmup_data);
63+
cuda_check(cudaDeviceSynchronize());
64+
}
6265
}
63-
cuda_check(cudaDeviceSynchronize());
6466

6567
std::vector<std::int64_t> durations;
6668
durations.reserve(TIMED_RUNS);
6769

6870
for (int i = 0; i < TIMED_RUNS; i++) {
69-
auto data = generate_input();
71+
auto data = generate_input(rng());
72+
7073
// make a copy of the input data to be used by the reference implementation
7174
auto copy = data;
7275

@@ -124,7 +127,15 @@ int main() {
124127
return 111;
125128
}
126129

127-
auto data = generate_input();
130+
// get the seed
131+
const char *seed_str = std::getenv("POPCORN_SEED");
132+
int seed = 42;
133+
if (seed_str) {
134+
seed = std::stoi(seed_str);
135+
}
136+
137+
std::mt19937 rng(seed);
138+
auto data = generate_input(rng());
128139
auto reference_output = ref_kernel(data);
129140
auto submission_output = custom_kernel(data);
130141

@@ -133,6 +144,6 @@ int main() {
133144
return 112;
134145
}
135146

136-
measure_runtime(logger);
147+
measure_runtime(logger, rng);
137148
return 0;
138149
}

src/discord-cluster-manager/eval.py

Lines changed: 11 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -16,10 +16,9 @@ def log(self, key: str, value):
1616
print(f"{key}: {value}\n", file=self.channel)
1717

1818

19-
def correctness() -> bool:
19+
def correctness(rng: torch.Generator) -> bool:
2020
for _ in range(10): # check multiple times
21-
inputs = generate_input()
22-
21+
inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item())
2322
custom_output = custom_kernel(inputs)
2423
ref_output = ref_kernel(inputs)
2524

@@ -30,22 +29,22 @@ def correctness() -> bool:
3029
return True
3130

3231

33-
def metric(logger: PopcornLogger):
32+
def metric(logger: PopcornLogger, rng: torch.Generator):
3433
warmup_runs = 10
3534
timed_runs = 100
3635

3736
# Warmup Code
3837
print("warming up...")
3938
for _ in range(warmup_runs):
40-
inputs = generate_input()
39+
inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item())
4140
_ = custom_kernel(inputs)
4241
torch.cuda.synchronize()
4342

4443
# Timing Code
4544
times = []
4645

4746
for _ in range(timed_runs):
48-
inputs = generate_input()
47+
inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item())
4948

5049
start_time = time.time()
5150
custom_output = custom_kernel(inputs)
@@ -82,10 +81,14 @@ def main():
8281
print(e, file=sys.stderr)
8382
exit(111)
8483

85-
if not correctness():
84+
seed = int(os.environ.get("POPCORN_SEED", 42))
85+
rng = torch.Generator()
86+
rng.manual_seed(seed)
87+
88+
if not correctness(rng):
8689
logger.log("check", "fail")
8790
exit(112)
88-
metric(logger)
91+
metric(logger, rng)
8992

9093

9194
if __name__ == "__main__":

src/discord-cluster-manager/run_eval.py

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ def compile_cuda_script( # # noqa: C901
6363
arch: Architecture to compile for. If None, uses `native`
6464
include_dirs: additional include directories to supply to nvcc
6565
verbose: whether to print progress or be silent
66-
66+
seed: Seed value to use for generating test cases
6767
Returns:
6868
A `CompileResult` that summarizes the compilation process.
6969
@@ -125,11 +125,12 @@ def compile_cuda_script( # # noqa: C901
125125
)
126126

127127

128-
def run_program(args: list[str]) -> RunResult:
128+
def run_program(args: list[str], seed: int) -> RunResult:
129129
# set up a pipe so the tester can communicate its verdict with us
130130
env = os.environ.copy()
131131
pipe_read, pipe_write = os.pipe()
132132
env["POPCORN_FD"] = str(pipe_write)
133+
env["POPCORN_SEED"] = str(seed)
133134

134135
execution_start_time = time.perf_counter()
135136
run_process = subprocess.run(
@@ -173,6 +174,7 @@ def run_cuda_script( # # noqa: C901
173174
headers: dict[str, str] = None,
174175
arch: int = None,
175176
include_dirs: list[str] = None,
177+
seed: int = 42,
176178
) -> tuple[CompileResult, RunResult]:
177179
"""
178180
Executes the provided CUDA kernel in an isolated environment
@@ -184,6 +186,7 @@ def run_cuda_script( # # noqa: C901
184186
compile command.
185187
arch: The arch code for the compute/sm versions. If None, native arch is used.
186188
include_dirs: Additional include directories, e.g., for thunderkittens/cutlass etc
189+
seed: Random seed to initialize the RNG for testing
187190
188191
Returns:
189192
tuple[CompileResult, RunResult]: CUDA compile/eval result information
@@ -218,9 +221,6 @@ def run_cuda_script( # # noqa: C901
218221
result={},
219222
)
220223

221-
run_result = run_program(["./eval.out"])
222-
return compile_result, run_result
223-
224224
# cleaning up all source files _before_ we let the user code run, just in
225225
# case there's something in there that the user isn't supposed to snoop
226226
finally:
@@ -229,25 +229,15 @@ def run_cuda_script( # # noqa: C901
229229
if os.path.exists(f):
230230
os.remove(f)
231231

232-
if not compile_result.success:
233-
return compile_result, RunResult(
234-
success=False,
235-
command="",
236-
stdout="",
237-
stderr="",
238-
exit_code=-1,
239-
duration=0.0,
240-
result={},
241-
)
242-
243-
run_result = run_program(["./eval.out"])
232+
run_result = run_program(["./eval.out"], seed=seed)
244233
return compile_result, run_result
245234

246235

247236
def run_pytorch_script( # noqa: C901
248237
sources: dict[str, str],
249238
main: str,
250239
arch: int = None,
240+
seed: int = 42,
251241
) -> RunResult:
252242
"""
253243
Executes the provided PyTorch GPU kernel in an isolated environment
@@ -256,6 +246,7 @@ def run_pytorch_script( # noqa: C901
256246
sources: Files to generate
257247
main: Which file to run. Must be one of the keys in sources.
258248
arch: The arch code for the compute/sm versions.
249+
seed: Random seed to initialize the RNG for testing
259250
260251
Returns:
261252
RunResult
@@ -266,7 +257,7 @@ def run_pytorch_script( # noqa: C901
266257
# Write submission files to directory
267258
for source, content in sources.items():
268259
Path(source).write_text(content)
269-
return run_program(["python", main])
260+
return run_program(["python", main], seed=seed)
270261

271262
finally:
272263
for f in sources.keys():

0 commit comments

Comments (0)