gpu-mode · msaroufim · Jan 18, 2026 · Jan 18, 2026 · Copilot · Jan 18, 2026
diff --git a/src/libkernelbot/launchers/modal.py b/src/libkernelbot/launchers/modal.py
@@ -30,9 +30,16 @@ async def run_submission(
 
         await status.push("⏳ Waiting for Modal run to finish...")
 
+        # Signal-based timeout = config "ranked_timeout" (default 180s) + 60s buffer.
+        # This catches most hangs; the container timeout is the fallback for hung GPUs.
+        task_timeout = config.get("ranked_timeout", 180)
+        signal_timeout = task_timeout + 60
+
         result = await loop.run_in_executor(
             None,
-            lambda: modal.Function.from_name("discord-bot-runner", func_name).remote(config=config),
+            lambda: modal.Function.from_name("discord-bot-runner", func_name).remote(
+                config=config, timeout_seconds=signal_timeout
+            ),
         )
 
         await status.update("✅ Waiting for modal run to finish... Done")

diff --git a/src/runners/modal_runner_archs.py b/src/runners/modal_runner_archs.py
@@ -2,12 +2,16 @@
 # Modal apps on specific devices. We will fix this later.
 from modal_runner import app, cuda_image, modal_run_config
 
+# Container-level timeout (seconds) - kills container regardless of GPU state
+# This is the nuclear option for hung GPUs that don't respond to signals
+MODAL_CONTAINER_TIMEOUT = 300
+
-
-# Container-level timeout (seconds) - kills container regardless of GPU state
-# This is the nuclear option for hung GPUs that don't respond to signals
-MODAL_CONTAINER_TIMEOUT = 300
+import os
+
+# Container-level timeout (seconds) - kills container regardless of GPU state
+# This is the nuclear option for hung GPUs that don't respond to signals
+# Configurable via the MODAL_CONTAINER_TIMEOUT environment variable; unset, non-integer,
+# or non-positive values fall back to a default chosen to exceed any expected signal timeout.
+def _get_modal_container_timeout(default: int = 900) -> int:
+    raw = os.getenv("MODAL_CONTAINER_TIMEOUT")
+    if raw is None:
+        return default
+    try:
+        value = int(raw)
+        return value if value > 0 else default
+    except (TypeError, ValueError):
+        return default
+
+MODAL_CONTAINER_TIMEOUT = _get_modal_container_timeout()
-
-# Container-level timeout (seconds) - kills container regardless of GPU state
-# This is the nuclear option for hung GPUs that don't respond to signals
-MODAL_CONTAINER_TIMEOUT = 300
+import os
+
+# Container-level timeout (seconds) - kills container regardless of GPU state
+# This is the nuclear option for hung GPUs that don't respond to signals
+# Configurable via the MODAL_CONTAINER_TIMEOUT environment variable; unset, non-integer,
+# or non-positive values fall back to a default chosen to exceed any expected signal timeout.
+def _get_modal_container_timeout(default: int = 900) -> int:
+    raw = os.getenv("MODAL_CONTAINER_TIMEOUT")
+    if raw is None:
+        return default
+    try:
+        value = int(raw)
+        return value if value > 0 else default
+    except (TypeError, ValueError):
+        return default
+
+MODAL_CONTAINER_TIMEOUT = _get_modal_container_timeout()
 gpus = ["T4", "L4", "L4:4", "A100-80GB", "H100!", "B200"]
 for gpu in gpus:
     gpu_slug = gpu.lower().split("-")[0].strip("!").replace(":", "x")
-    app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu_slug}", serialized=True)(
+    app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu_slug}", serialized=True, timeout=MODAL_CONTAINER_TIMEOUT)(
         modal_run_config
     )
-    app.function(gpu=gpu, image=cuda_image, name=f"run_pytorch_script_{gpu_slug}", serialized=True)(
+    app.function(gpu=gpu, image=cuda_image, name=f"run_pytorch_script_{gpu_slug}", serialized=True, timeout=MODAL_CONTAINER_TIMEOUT)(
         modal_run_config
     )