Commit 75bc298

Merge pull request #366 from algorithmicsuperintelligence/feat-add-rich-feedback-example
Add rich feedback mode to k_module_problem example
2 parents fda1963 + 93b798e commit 75bc298

File tree (3 files changed, +75 / -3 lines)

- examples/k_module_problem/README.md
- examples/k_module_problem/evaluator.py
- examples/k_module_problem/iterative_agent.py
examples/k_module_problem/README.md

Lines changed: 19 additions & 0 deletions

````diff
@@ -166,6 +166,25 @@ This establishes the "no learning" baseline. Any method that beats this is demon

 **Key insight**: While OpenEvolve takes more iterations on average (52.3 vs 13), it has a **100% success rate** compared to iterative refinement's 33%. The evolutionary approach's population diversity ensures it eventually escapes local optima that trap single-trajectory methods.

+### Rich Feedback Mode: Proving Attribution Matters
+
+To verify that feedback attribution is the key factor, we added a `RICH_FEEDBACK=1` mode that tells the agent exactly which modules are correct/incorrect:
+
+```bash
+RICH_FEEDBACK=1 python run_iterative_trials.py --trials 3 --iterations 100
+```
+
+| Method | Success Rate | Avg Iterations |
+|--------|-------------|----------------|
+| **Iterative (no feedback)** | 33% | 13 (when found) |
+| **Iterative (rich feedback)** | **100%** | **3** |
+
+With rich feedback, iterative refinement achieves **100% success rate in only 3 iterations** - dramatically faster than OpenEvolve's 52 iterations! This proves that:
+
+1. **Feedback attribution is the key factor**, not the optimization method
+2. When feedback is attributable, iterative refinement is highly effective
+3. Evolution is necessary when feedback is NOT attributable (you can't tell which component is wrong)
+
 ## Why This Matters

 This example illustrates when you should prefer evolutionary approaches:
````
examples/k_module_problem/evaluator.py

Lines changed: 30 additions & 2 deletions

```diff
@@ -9,13 +9,21 @@
 This creates a challenging landscape for iterative refinement but
 allows evolutionary crossover to combine good "building blocks"
 from different individuals.
+
+Set RICH_FEEDBACK=1 to enable rich feedback mode, which tells you
+exactly which modules are correct/incorrect. This demonstrates that
+iterative refinement works well when feedback is attributable.
 """

+import os
 import sys
 import time
 import traceback
 import importlib.util

+# Rich feedback mode - when enabled, reveals which modules are correct
+RICH_FEEDBACK = os.environ.get("RICH_FEEDBACK", "0") == "1"
+
 # The correct solution (hidden from the optimizer)
 # This represents the "optimal" pipeline configuration discovered through
 # extensive testing/domain expertise
@@ -141,14 +149,34 @@ def score_config(config: dict) -> tuple:

 def build_artifacts(config: dict, correct_count: int, module_results: dict, eval_time: float) -> dict:
     """
-    Build artifacts that provide useful feedback without revealing
-    exactly which modules are correct.
+    Build artifacts that provide useful feedback.
+
+    In normal mode: Only reveals how many modules are correct, not which ones.
+    In rich feedback mode (RICH_FEEDBACK=1): Reveals exactly which modules are correct/incorrect.
     """
     artifacts = {}

     # Configuration summary
     artifacts["configuration"] = str(config)

+    # Rich feedback mode - reveals which modules are correct/incorrect
+    if RICH_FEEDBACK:
+        correct_modules = [m for m, is_correct in module_results.items() if is_correct]
+        incorrect_modules = [m for m, is_correct in module_results.items() if not is_correct]
+
+        artifacts["module_feedback"] = {
+            "correct": correct_modules,
+            "incorrect": incorrect_modules,
+        }
+
+        if incorrect_modules:
+            hints = []
+            for module in incorrect_modules:
+                hints.append(f"'{module}' is WRONG - try a different option from {VALID_OPTIONS[module]}")
+            artifacts["actionable_hints"] = hints
+        else:
+            artifacts["actionable_hints"] = ["All modules are correct!"]
+
     # Score feedback - tells you how many are correct, but not which ones
     if correct_count == NUM_MODULES:
         artifacts["status"] = "PERFECT! All modules correctly configured!"
```
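As a quick sanity check of the new branch, the sketch below calls `build_artifacts` directly with a fabricated result. This is not part of the commit: it assumes you run it from `examples/k_module_problem`, that `VALID_OPTIONS` maps each module name to an indexable collection of options (as the hint string suggests), and it patches the module-level flag because `RICH_FEEDBACK` is read from the environment only once, at import time.

```python
import evaluator

# Force rich feedback regardless of the current environment; the env var
# is only consulted when evaluator.py is imported.
evaluator.RICH_FEEDBACK = True

# Fabricate a result: mark the first module wrong and the rest correct.
modules = list(evaluator.VALID_OPTIONS)
module_results = {m: (i != 0) for i, m in enumerate(modules)}
config = {m: evaluator.VALID_OPTIONS[m][0] for m in modules}  # arbitrary picks

artifacts = evaluator.build_artifacts(
    config,
    correct_count=sum(module_results.values()),
    module_results=module_results,
    eval_time=0.0,
)

print(artifacts["module_feedback"])   # lists which modules are correct/incorrect
print(artifacts["actionable_hints"])  # one hint naming the wrong module and its valid options
```

With `evaluator.RICH_FEEDBACK = False` the same call would omit both keys and fall back to the count-only feedback.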

examples/k_module_problem/iterative_agent.py

Lines changed: 26 additions & 1 deletion

```diff
@@ -64,6 +64,26 @@ def write_program(program_path: str, code: str) -> None:
         f.write(code)


+def format_rich_feedback(artifacts: dict) -> str:
+    """Format rich feedback if available (RICH_FEEDBACK=1)."""
+    if "module_feedback" not in artifacts:
+        return ""
+
+    feedback = artifacts["module_feedback"]
+    hints = artifacts.get("actionable_hints", [])
+
+    result = "\n## DETAILED MODULE FEEDBACK (Rich Feedback Mode)\n"
+    result += f"- CORRECT modules: {feedback.get('correct', [])}\n"
+    result += f"- INCORRECT modules: {feedback.get('incorrect', [])}\n"
+
+    if hints:
+        result += "\n### Actionable Hints:\n"
+        for hint in hints:
+            result += f"- {hint}\n"
+
+    return result
+
+
 def create_improvement_prompt(
     current_code: str,
     metrics: dict,
@@ -108,6 +128,7 @@ def create_improvement_prompt(
 - Score: {metrics.get('combined_score', 0):.2%}
 - Status: {artifacts.get('status', 'N/A')}
 - Suggestion: {artifacts.get('suggestion', 'N/A')}
+{format_rich_feedback(artifacts)}
 {history_str}

 ## Your Task
@@ -205,7 +226,11 @@ def run_iterative_refinement(

         # Evaluate current program
         eval_result = evaluate(str(current_program_path))
-        metrics = eval_result.get("metrics", {})
+        # Handle both flat (success) and nested (error) return formats
+        if "metrics" in eval_result:
+            metrics = eval_result["metrics"]
+        else:
+            metrics = {k: v for k, v in eval_result.items() if k != "artifacts"}
         artifacts = eval_result.get("artifacts", {})

         score = metrics.get("combined_score", 0)
```
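To see what the new prompt section renders as, here is a small sketch of calling `format_rich_feedback` with a hand-written artifacts dict. It is not part of the commit; it assumes you run it from `examples/k_module_problem` with the module's own imports available, and the module names and options in the dict are placeholders.

```python
from iterative_agent import format_rich_feedback

# Artifacts shaped like what build_artifacts emits under RICH_FEEDBACK=1,
# but with placeholder module names and options.
artifacts = {
    "module_feedback": {"correct": ["module_a", "module_c"], "incorrect": ["module_b"]},
    "actionable_hints": ["'module_b' is WRONG - try a different option from ['fast', 'accurate']"],
}

print(format_rich_feedback(artifacts))
# ## DETAILED MODULE FEEDBACK (Rich Feedback Mode)
# - CORRECT modules: ['module_a', 'module_c']
# - INCORRECT modules: ['module_b']
#
# ### Actionable Hints:
# - 'module_b' is WRONG - try a different option from ['fast', 'accurate']
```

Without the `module_feedback` key the function returns an empty string, so in normal mode the improvement prompt is unchanged.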
