Skip to content

Commit 7c233d3

Browse files
alexzhang13 authored and ngc92 committed
fix db support for new gh/modal runners and verifiers
fix verify cogs to support new return types
1 parent 1d9a56b commit 7c233d3

File tree

5 files changed

+50
-70
lines changed

5 files changed

+50
-70
lines changed

src/discord-cluster-manager/cogs/github_cog.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -44,12 +44,12 @@ async def run_github(
4444
gpu_type: app_commands.Choice[str],
4545
reference_script: discord.Attachment = None,
4646
reference_code: str = None,
47-
) -> discord.Thread:
47+
) -> tuple[discord.Thread, FullResult]:
4848
if not script.filename.endswith((".py", ".cu", ".cuh", ".cpp")):
4949
await send_discord_message(
5050
interaction, "Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file"
5151
)
52-
return None
52+
return None, None
5353

5454
thread = await self.bot.create_thread(interaction, gpu_type.name, "GitHub Job")
5555
await thread.send(f"Processing `{script.filename}` with {gpu_type.name}...")
@@ -98,7 +98,7 @@ async def run_github(
9898
"Failed to trigger GitHub Action. Please check the configuration."
9999
)
100100

101-
return thread
101+
return thread, result
102102

103103
except Exception as e:
104104
logger.error(f"Error processing request: {str(e)}", exc_info=True)

src/discord-cluster-manager/cogs/leaderboard_cog.py

Lines changed: 28 additions & 44 deletions
Original file line number | Diff line number | Diff line change
@@ -17,7 +17,6 @@
1717
from ui.misc import DeleteConfirmationModal, GPUSelectionView
1818
from ui.table import create_table
1919
from utils import (
20-
extract_score,
2120
get_user_from_id,
2221
send_discord_message,
2322
setup_logging,
@@ -44,7 +43,7 @@ async def async_submit_cog_job(
4443
runner_name: str = "GitHub",
4544
):
4645
try:
47-
discord_thread = await command.callback(
46+
discord_thread, result = await command.callback(
4847
cog,
4948
interaction,
5049
script,
@@ -58,55 +57,40 @@ async def async_submit_cog_job(
5857
print(f"Webhook not found: {e}")
5958
await send_discord_message(interaction, "❌ The webhook was not found.")
6059

61-
message_contents = [msg.content async for msg in discord_thread.history(limit=None)]
62-
6360
try:
64-
# For CUDA leaderboards, make more robust
65-
if "check_implementation failed" in message_contents:
66-
await send_discord_message(
67-
interaction,
68-
"check_implementation failed. User kernel and reference kernel do not match.",
69-
ephemeral=True,
70-
)
71-
return
72-
73-
# TODO: Make this more robust later
74-
score = extract_score("".join(message_contents))
61+
if result.success:
62+
score = float(result.run.result["duration.mean"]) / 1e9
63+
64+
with self.bot.leaderboard_db as db:
65+
db.create_submission(
66+
{
67+
"submission_name": script.filename,
68+
"submission_time": datetime.now(),
69+
"leaderboard_name": leaderboard_name,
70+
"code": submission_content,
71+
"user_id": interaction.user.id,
72+
"submission_score": score,
73+
"gpu_type": gpu.name,
74+
}
75+
)
7576

76-
with self.bot.leaderboard_db as db:
77-
db.create_submission(
78-
{
79-
"submission_name": script.filename,
80-
"submission_time": datetime.now(),
81-
"leaderboard_name": leaderboard_name,
82-
"code": submission_content,
83-
"user_id": interaction.user.id,
84-
"submission_score": score,
85-
"gpu_type": gpu.name,
86-
}
77+
user_id = (
78+
interaction.user.global_name
79+
if interaction.user.nick is None
80+
else interaction.user.nick
8781
)
8882

89-
user_id = (
90-
interaction.user.global_name
91-
if interaction.user.nick is None
92-
else interaction.user.nick
93-
)
94-
95-
await send_discord_message(
96-
interaction,
97-
f"Successfully ran on {gpu.name} using {runner_name} runners!\n"
98-
+ f"Leaderboard '{leaderboard_name}'.\n"
99-
+ f"Submission title: {script.filename}.\n"
100-
+ f"Submission user: {user_id}.\n"
101-
+ f"Runtime: {score:.9f} seconds.",
102-
ephemeral=True,
103-
)
83+
await discord_thread.send(
84+
f"Successfully ran on {gpu.name} using {runner_name} runners!\n"
85+
+ f"Leaderboard '{leaderboard_name}'.\n"
86+
+ f"Submission title: {script.filename}.\n"
87+
+ f"Submission user: {user_id}.\n"
88+
+ f"Runtime: {score:.9f} seconds.",
89+
)
10490
except Exception:
105-
await send_discord_message(
106-
interaction,
91+
await discord_thread.send(
10792
f"Leaderboard submission to '{leaderboard_name}' on {gpu.name} "
10893
+ f"using {runner_name} runners failed!\n",
109-
ephemeral=True,
11094
)
11195

11296
async def select_gpu_view(

src/discord-cluster-manager/cogs/modal_cog.py

Lines changed: 13 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,8 @@
88
from discord.ext import commands
99
from leaderboard_eval import cu_eval, py_eval
1010
from report import generate_report
11-
from utils import send_discord_message, send_logs, setup_logging
11+
from run_eval import FullResult
12+
from utils import send_discord_message, setup_logging
1213

1314
logger = setup_logging()
1415

@@ -34,7 +35,7 @@ async def run_modal(
3435
gpu_type: app_commands.Choice[str],
3536
reference_script: Optional[discord.Attachment] = None,
3637
reference_code: str = None,
37-
) -> discord.Thread:
38+
) -> tuple[discord.Thread, FullResult]:
3839
thread = None
3940
status_msg = None
4041
try:
@@ -44,11 +45,11 @@ async def run_modal(
4445
"Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file",
4546
ephemeral=True,
4647
)
47-
return None
48+
return None, None
4849

49-
# TODO: Maybe find a better way?
5050
if not interaction.response.is_done():
5151
await interaction.response.defer(ephemeral=True)
52+
5253
channel = interaction.channel
5354
message = await channel.send(f"Starting Modal job with {gpu_type.name}...")
5455
thread = await message.create_thread(name=f"{gpu_type.name} Modal Job")
@@ -67,7 +68,7 @@ async def run_modal(
6768
else (await reference_script.read()).decode("utf-8")
6869
)
6970

70-
await self.handle_modal_execution(
71+
result = await self.handle_modal_execution(
7172
interaction,
7273
thread,
7374
script_content,
@@ -76,7 +77,7 @@ async def run_modal(
7677
reference_content,
7778
status_msg,
7879
)
79-
return thread
80+
return thread, result
8081

8182
except Exception as e:
8283
logger.error(f"Error processing request: {str(e)}", exc_info=True)
@@ -94,7 +95,7 @@ async def handle_modal_execution(
9495
gpu_type: str,
9596
reference_content: Optional[str],
9697
status_msg: discord.Message,
97-
):
98+
) -> FullResult:
9899
try:
99100
loop = asyncio.get_event_loop()
100101
func_type = "pytorch" if filename.endswith(".py") else "cuda"
@@ -113,9 +114,11 @@ async def handle_modal_execution(
113114
# Send results
114115
await thread.send(f"\n**Script size:** {len(script_content)} bytes")
115116
await generate_report(thread, result)
117+
return result
116118

117119
else:
118-
result, score = await loop.run_in_executor(
120+
# Currently broken?
121+
result = await loop.run_in_executor(
119122
None,
120123
lambda: modal.Function.lookup("discord-bot-runner", func_name).remote(
121124
script_content,
@@ -127,18 +130,10 @@ async def handle_modal_execution(
127130

128131
# Send results
129132
await thread.send(f"\n**Script size:** {len(script_content)} bytes")
130-
await thread.send(f"**Execution time:** {score:.3f} s\n")
131-
132-
if "check_implementation failed" in result or "Error" in result:
133-
await thread.send("Modal run failed.\n")
134-
await send_logs(thread, result)
135-
await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
136-
return result, 0
137-
138-
if result is not None:
139-
await thread.send(f"**score:{score:.9f}**\n```")
133+
await thread.send(f"**Execution time:** {result.run.duration:.3f} s\n")
140134

141135
await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!")
136+
return result
142137

143138
except Exception as e:
144139
logger.error(f"Error in handle_modal_execution: {str(e)}", exc_info=True)

src/discord-cluster-manager/cogs/verify_run_cog.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ async def verify_github_run(
5555
)
5656
ref_code = Path("examples/identity_cuda/reference.cuh").read_text()
5757

58-
github_thread = await github_command.callback(
58+
github_thread, result = await github_command.callback(
5959
github_cog, interaction, sub_code, choice, reference_code=ref_code
6060
)
6161

@@ -89,7 +89,8 @@ async def verify_github_run(
8989
]
9090
await send_discord_message(
9191
interaction,
92-
f"❌ GitHub run ({choice.name}) for {lang} verification failed. Missing expected messages:\n"
92+
f"❌ GitHub run ({choice.name}) for {lang} verification failed. "
93+
+ "Missing expected messages:\n"
9394
+ "\n".join(f"- {pattern}" for pattern in missing_patterns),
9495
)
9596
return False
@@ -111,7 +112,7 @@ async def verify_modal_run(
111112
)
112113
ref_code = Path("examples/identity_cuda/reference.cuh").read_text()
113114

114-
modal_thread = await modal_command.callback(
115+
modal_thread, result = await modal_command.callback(
115116
modal_cog, interaction, sub_code, t4, reference_code=ref_code
116117
)
117118

src/discord-cluster-manager/report.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
3838
return ""
3939

4040

41-
async def generate_report(thread: discord.Thread, result: FullResult):
41+
async def generate_report(thread: discord.Thread, result: FullResult): # noqa: C901
4242
message = ""
4343
if not result.success:
4444
message += "# Failure\n"
@@ -106,7 +106,7 @@ async def generate_report(thread: discord.Thread, result: FullResult):
106106
if len(message) != 0:
107107
await thread.send(message)
108108

109-
# TODO dedicated "error" entry in our results dict that gets populated by check_implementation
109+
# TODO dedicated "error" entry in our results that gets populated by check_implementation
110110
return
111111

112112
# OK, we were successful

0 commit comments

Comments
 (0)