extended test for python implementation + adjusted exit codes for consistency

ngc92 · ngc92 · commit 6d030d05af3e · 2025-01-14T15:49:26.000+02:00
diff --git a/.github/workflows/cuda_test.yml b/.github/workflows/cuda_test.yml
diff --git a/.github/workflows/runner_ci.yml b/.github/workflows/runner_ci.yml
@@ -0,0 +1,65 @@
+name: Runner CI
+
+on:
+  push:
+    branches: [ main ]
+  pull_request:
+    branches: [ main ]
+
+jobs:
+  check-cuda:
+    runs-on: [gpumode-nvidia-arc]
+    timeout-minutes: 10
+    container:
+      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install pytest
+        shell: bash
+        run: pip install pytest
+
+      - name: Run script
+        shell: bash
+        run: pytest scripts/ci_test_cuda.py
+
+    env:
+      CUDA_VISIBLE_DEVICES: 0
+
+  check-pytorch:
+    runs-on: [gpumode-nvidia-arc]
+    timeout-minutes: 10
+    container:
+      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
+    steps:
+      - uses: actions/checkout@v3
+
+      - name: Setup Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: '3.10'
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v3
+        with:
+          version: "latest"
+
+      - name: Setup Python environment
+        run: |
+          uv venv .venv
+          echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV
+          echo "$PWD/.venv/bin" >> $GITHUB_PATH
+          uv pip install numpy torch setuptools ninja pytest
+
+      - name: Run script
+        shell: bash
+        run: pytest scripts/ci_test_python.py
+
+    env:
+      CUDA_VISIBLE_DEVICES: 0
+
diff --git a/scripts/ci_test_cuda.py b/scripts/ci_test_cuda.py
@@ -55,9 +55,9 @@ def test_cuda_runtime_error():
     assert run.success is False
     assert run.command == "./eval.out"
     assert "warming up..." in run.stdout
-    assert "cudaDeviceSynchronize() at eval.cu(64) in `measure_runtime`" in run.stderr
+    assert "cudaDeviceSynchronize() at eval.cu(63) in `measure_runtime`" in run.stderr
     assert "an illegal memory access was encountered" in run.stderr
-    assert run.exit_code == 3
+    assert run.exit_code == 110
     assert len(run.result) == 0
 
 
@@ -85,7 +85,7 @@ def test_cuda_validation_fail():
     # we never reach the benchmark part, because the test fails
     assert "warming up..." not in run.stdout
     assert "ERROR AT 0, 0" in run.stderr
-    assert run.exit_code == 1
+    assert run.exit_code == 112
     assert run.result["check"] == "fail"
 
 
diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py
@@ -0,0 +1,52 @@
+import os
+import sys
+from pathlib import Path
+
+if Path().resolve().name == "scripts":
+    os.chdir("..")
+
+sys.path.append("src/discord-cluster-manager")
+
+from leaderboard_eval import py_eval
+from run_eval import run_pytorch_script
+
+ref = Path("examples/identity_py/reference.py")
+
+
+def test_does_not_import():
+    # the submission is not valid Python (stray indented text), so importing it fails
+    sub = """
+    this is a syntax error
+    """
+
+    run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
+    assert run.success is False
+    assert run.exit_code == 1
+    assert "IndentationError: unexpected indent\n" in run.stderr
+
+
+def test_error():
+    # returns all-zero tensors: runs fine, but fails the correctness check
+    sub = """
+import torch
+def custom_kernel(input):
+    return [torch.zeros_like(i) for i in input]
+        """
+    run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
+    assert run.success is False
+    assert run.command == "python eval.py"
+    # we never reach the benchmark part, because the test fails
+    assert "warming up..." not in run.stdout
+    assert "mismatch found! custom implementation doesnt match reference." in run.stdout
+    assert run.exit_code == 112
+    assert run.result["check"] == "fail"
+
+
+def test_correct():
+    sub = Path("examples/identity_py/submission.py").read_text()
+
+    run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
+    assert run.success is True
+    assert "warming up..." in run.stdout
+    assert run.exit_code == 0
+    assert run.result["check"] == "pass"
diff --git a/src/discord-cluster-manager/eval.cu b/src/discord-cluster-manager/eval.cu
@@ -46,8 +46,7 @@ static void cuda_check(cudaError_t status, const char* expr, const char* file, i
                   << line << ") in `"
                   << function << "`: "
                   << cudaGetErrorString(status) << std::endl;
-        // following pytest convention, exit code 3 means internal error
-        std::exit(3);
+        std::exit(110);
     }
 }
 
@@ -83,7 +82,7 @@ void measure_runtime(PopcornOutput& logger) {
         auto reference_output = ref_kernel(copy);
         if (!check_implementation(submission_output, reference_output)) {
             logger.log("check", "fail");
-            std::exit(1);
+            std::exit(112);
         }
 
     }
@@ -122,7 +121,7 @@ int main() {
         int fd = std::stoi(output_fd);
         logger.File.reset(::fdopen(fd, "w"));
     } else {
-        return 4;       // pytest: usage error
+        return 111;
     }
 
     auto data = generate_input();
@@ -131,7 +130,7 @@ int main() {
 
     if (!check_implementation(submission_output, reference_output)) {
         logger.log("check", "fail");
-        return 1;
+        return 112;
     }
 
     measure_runtime(logger);
diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py
@@ -1,5 +1,6 @@
 import math
 import os
+import sys
 import time
 
 import torch
@@ -56,7 +57,7 @@ def metric(logger: PopcornLogger):
         torch.cuda.synchronize()
         if not check_implementation(custom_output, ref_output):
             logger.log("check", "fail")
-            exit(1)
+            exit(112)
 
     total_time = sum(times)
     average_duration = total_time / timed_runs
@@ -75,10 +76,15 @@ def metric(logger: PopcornLogger):
 
 
 def main():
-    logger = PopcornLogger(int(os.environ["POPCORN_FD"]))
+    try:
+        logger = PopcornLogger(int(os.environ["POPCORN_FD"]))
+    except Exception as e:
+        print(e, file=sys.stderr)
+        exit(111)
+
     if not correctness():
         logger.log("check", "fail")
-        exit(1)
+        exit(112)
     metric(logger)
 
 
diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py
@@ -141,10 +141,10 @@ def run_program(args: list[str]) -> RunResult:
         key, _, value = line.partition(":")
         result_dict[key.strip()] = value.strip()
 
+    # 0     everything was fine
+    # 112   program ran fine, but we detected a test failure
     return RunResult(
-        # TODO should we return 0 also on test failure?
-        # TODO check what return codes python uses, e.g. on uncaught exception
-        success=(run_process.returncode == 0 or run_process.returncode == 1),
+        success=run_process.returncode == 0,
         command=_make_cmd(run_process.args),
         stdout=run_process.stdout,
         stderr=run_process.stderr,

Original file line number	Diff line number	Diff line change
`@@ -46,8 +46,7 @@ static void cuda_check(cudaError_t status, const char* expr, const char* file, i`
`46`	`46`	<< line << ") in `"
`47`	`47`	<< function << "`: "
`48`	`48`	`<< cudaGetErrorString(status) << std::endl;`
`49`		`- // following pytest convention, exit code 3 means internal error`
`50`		`- std::exit(3);`
	`49`	`+ std::exit(110);`
`51`	`50`	`}`
`52`	`51`	`}`
`53`	`52`
`@@ -83,7 +82,7 @@ void measure_runtime(PopcornOutput& logger) {`
`83`	`82`	`auto reference_output = ref_kernel(copy);`
`84`	`83`	`if (!check_implementation(submission_output, reference_output)) {`
`85`	`84`	`logger.log("check", "fail");`
`86`		`- std::exit(1);`
	`85`	`+ std::exit(112);`
`87`	`86`	`}`
`88`	`87`
`89`	`88`	`}`
`@@ -122,7 +121,7 @@ int main() {`
`122`	`121`	`int fd = std::stoi(output_fd);`
`123`	`122`	`logger.File.reset(::fdopen(fd, "w"));`
`124`	`123`	`} else {`
`125`		`- return 4; // pytest: usage error`
	`124`	`+ return 111;`
`126`	`125`	`}`
`127`	`126`
`128`	`127`	`auto data = generate_input();`
`@@ -131,7 +130,7 @@ int main() {`
`131`	`130`
`132`	`131`	`if (!check_implementation(submission_output, reference_output)) {`
`133`	`132`	`logger.log("check", "fail");`
`134`		`- return 1;`
	`133`	`+ return 112;`
`135`	`134`	`}`
`136`	`135`
`137`	`136`	`measure_runtime(logger);`