From d232d4713ebc4ba756db602cb354b96922cabbaa Mon Sep 17 00:00:00 2001
From: Qiong Wu <qiowu@microsoft.com>
Date: Mon, 15 Jun 2026 09:41:45 +0800
Subject: [PATCH] fix(sa-eval): run cleanup on SKIP_* / exception paths

evaluate_model has five early-return paths via _skip_result that bypass the trailing cleanup_onnx_artifacts call, so any model that fails mid-pipeline (e.g. SKIP_GRAPH_OPT for LLMs whose graph_optimize step aborts) leaks multi-GB ONNX external-data files. A single Qwen2.5-1.5B SKIP_GRAPH_OPT folder leaked 22.9 GB.

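Move the cleanup invocation from evaluate_model's success path into a try/finally in the main loop so --cleanup runs no matter how evaluate_model exits: a normal return, any of the SKIP_BUILD / SKIP_EXPORT / SKIP_GRAPH_OPT / SKIP_SA_PRE / SKIP_SA_POST early returns, or a raised exception (which still propagates after cleanup). evaluate_model's cleanup parameter is removed accordingly.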
---
 scripts/e2e_eval/run_sa_eval.py | 40 ++++++++++++++++-----------------
 1 file changed, 20 insertions(+), 20 deletions(-)

diff --git a/scripts/e2e_eval/run_sa_eval.py b/scripts/e2e_eval/run_sa_eval.py
index ccf6edbb3..fc9bd671b 100644
--- a/scripts/e2e_eval/run_sa_eval.py
+++ b/scripts/e2e_eval/run_sa_eval.py
@@ -673,7 +673,6 @@ def evaluate_model(
     run_quantize: bool = True,
     quantize_precision: str = "int8",
     run_compile: bool = True,
-    cleanup: bool = False,
 ) -> dict | None:
     """Run the winml build + SA analysis pipeline for a single model."""
     hf_id = model_entry["hf_id"]
@@ -900,9 +899,6 @@ def _fmt(p: dict | None) -> str:
     out_file.write_text(json.dumps(result, indent=2, ensure_ascii=False), encoding="utf-8")
     safe_print(f"  Written: {out_file}")
 
-    if cleanup:
-        cleanup_onnx_artifacts(model_dir)
-
     return result
 
 
@@ -1319,22 +1315,26 @@ def main() -> None:
                 pass  # Corrupted result file — re-run
 
         safe_print(f"\n[{i}/{len(models_to_run)}]")
-        result = evaluate_model(
-            entry,
-            output_dir,
-            use_cache=args.use_cache,
-            ep=args.ep,
-            device=args.device,
-            run_perf=not args.no_perf,
-            perf_iterations=args.perf_iterations,
-            perf_warmup=args.perf_warmup,
-            run_quantize=not args.no_quantize,
-            quantize_precision=args.quantize_precision,
-            run_compile=not args.no_compile,
-            cleanup=args.cleanup,
-        )
-        if result:
-            all_results.append(result)
+        try:
+            result = evaluate_model(
+                entry,
+                output_dir,
+                use_cache=args.use_cache,
+                ep=args.ep,
+                device=args.device,
+                run_perf=not args.no_perf,
+                perf_iterations=args.perf_iterations,
+                perf_warmup=args.perf_warmup,
+                run_quantize=not args.no_quantize,
+                quantize_precision=args.quantize_precision,
+                run_compile=not args.no_compile,
+            )
+            if result:
+                all_results.append(result)
+        finally:
+            # Cleanup must run on SKIP_* / exception paths too, not just success.
+            if args.cleanup:
+                cleanup_onnx_artifacts(model_dir)
 
     elapsed = time.monotonic() - t_start
     complete = [r for r in all_results if r.get("status") == "COMPLETE"]