Skip to content

Commit d26faaf

Browse files
committed
added an enum for exit codes
1 parent c3b2976 commit d26faaf

File tree

4 files changed

+28
-11
lines changed

4 files changed

+28
-11
lines changed

scripts/ci_test_cuda.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
sys.path.append("src/discord-cluster-manager")
99

10+
from consts import ExitCode
1011
from leaderboard_eval import cu_eval
1112
from run_eval import run_cuda_script
1213

@@ -23,6 +24,7 @@ def test_does_not_compile():
2324
assert comp.success is False
2425
assert run.success is False
2526
assert comp.nvcc_found is True
27+
assert comp.exit_code != ExitCode.SUCCESS
2628
assert comp.stdout == ""
2729
assert 'train.cuh(2): error: identifier "input_tt" is undefined' in comp.stderr
2830
assert '1 error detected in the compilation of "eval.cu".' in comp.stderr
@@ -57,7 +59,7 @@ def test_cuda_runtime_error():
5759
assert "warming up..." in run.stdout
5860
assert "cudaDeviceSynchronize() at eval.cu(63) in `measure_runtime`" in run.stderr
5961
assert "an illegal memory access was encountered" in run.stderr
60-
assert run.exit_code == 110
62+
assert run.exit_code == ExitCode.CUDA_FAIL
6163
assert len(run.result) == 0
6264

6365

@@ -85,7 +87,7 @@ def test_cuda_validation_fail():
8587
# we never reach the benchmark part, because the test fails
8688
assert "warming up..." not in run.stdout
8789
assert "ERROR AT 0, 0" in run.stderr
88-
assert run.exit_code == 112
90+
assert run.exit_code == ExitCode.VALIDATE_FAIL
8991
assert run.result["check"] == "fail"
9092

9193

@@ -96,5 +98,5 @@ def test_cuda_correct():
9698
assert comp.success is True
9799
assert run.success is True
98100
assert "warming up..." in run.stdout
99-
assert run.exit_code == 0
101+
assert run.exit_code == ExitCode.SUCCESS
100102
assert run.result["check"] == "pass"

scripts/ci_test_python.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77

88
sys.path.append("src/discord-cluster-manager")
99

10+
from consts import ExitCode
1011
from leaderboard_eval import py_eval
1112
from run_eval import run_pytorch_script
1213

@@ -21,7 +22,7 @@ def test_does_not_import():
2122

2223
run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
2324
assert run.success is False
24-
assert run.exit_code == 1
25+
assert run.exit_code != ExitCode.SUCCESS
2526
assert "IndentationError: unexpected indent\n" in run.stderr
2627

2728

@@ -38,7 +39,7 @@ def custom_kernel(input):
3839
# we never reach the benchmark part, because the test fails
3940
assert "warming up..." not in run.stdout
4041
assert "mismatch found! custom implementation doesnt match reference." in run.stdout
41-
assert run.exit_code == 112
42+
assert run.exit_code == ExitCode.VALIDATE_FAIL
4243
assert run.result["check"] == "fail"
4344

4445

@@ -48,5 +49,5 @@ def test_correct():
4849
run = run_pytorch_script(py_eval, ref.read_text(), sub, arch=None)
4950
assert run.success is True
5051
assert "warming up..." in run.stdout
51-
assert run.exit_code == 0
52+
assert run.exit_code == ExitCode.SUCCESS
5253
assert run.result["check"] == "pass"

src/discord-cluster-manager/consts.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
from enum import Enum
1+
from enum import Enum, IntEnum
22
from typing import Type
33

44

@@ -25,6 +25,22 @@ class ModalGPU(Enum):
2525
H100 = "H100"
2626

2727

28+
class ExitCode(IntEnum):
29+
"""
30+
Exit codes for our runners. These are just the codes we actively return,
31+
others are possible (e.g., exiting due to segfault, permissions, signal, ...)
32+
"""
33+
34+
# program ran successfully
35+
SUCCESS = 0
36+
# a cuda API call failed
37+
CUDA_FAIL = 110
38+
# could not set up file descriptor for custom pipe
39+
PIPE_FAILED = 111
40+
# didn't crash, but tests failed
41+
VALIDATE_FAIL = 112
42+
43+
2844
def combine_enums(enums: list[Type[Enum]], combined_name: str) -> Enum:
2945
combined_members = {}
3046
for enum in enums:

src/discord-cluster-manager/run_eval.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
import time
66
from typing import Optional
77

8-
from consts import CUDA_FLAGS
8+
from consts import CUDA_FLAGS, ExitCode
99

1010

1111
@dataclasses.dataclass
@@ -141,10 +141,8 @@ def run_program(args: list[str]) -> RunResult:
141141
key, _, value = line.partition(":")
142142
result_dict[key.strip()] = value.strip()
143143

144-
# 0 everything was fine
145-
# 112 program ran fine, but we detected a test failure
146144
return RunResult(
147-
success=run_process.returncode == 0,
145+
success=run_process.returncode == ExitCode.SUCCESS,
148146
command=_make_cmd(run_process.args),
149147
stdout=run_process.stdout,
150148
stderr=run_process.stderr,

0 commit comments

Comments
 (0)