Skip to content

Commit 5db0450

Browse files
authored
Merge pull request #105 from kalibr-ai/claude/add-kalibr-auto-scoring-dmoZp
Add continuous scoring and default heuristic scoring to Router
2 parents d073841 + db40548 commit 5db0450

2 files changed

Lines changed: 297 additions & 8 deletions

File tree

kalibr/router.py

Lines changed: 119 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -50,16 +50,25 @@ class Router:
5050
"""
5151
Routes LLM requests to the best model based on learned outcomes.
5252
53-
Example:
53+
Three scoring modes (in priority order):
54+
1. score_when: Continuous scoring (0.0-1.0). Best for quality optimization.
55+
Example: score_when=lambda out: min(1.0, len(out) / 500)
56+
2. success_when: Binary scoring (True/False). Good for pass/fail checks.
57+
Example: success_when=lambda out: "@" in out
58+
3. Default: When neither is provided, Kalibr auto-scores using heuristics
59+
(response length, structure, finish reason). Gives day-one metrics
60+
without any evaluation code.
61+
62+
Examples:
63+
# Continuous scoring (best quality signal)
5464
router = Router(
5565
goal="summarize",
5666
paths=["gpt-4o", "claude-sonnet-4-20250514"],
57-
success_when=lambda out: len(out) > 100
67+
score_when=lambda out: min(1.0, len(out) / 500)
5868
)
5969
response = router.completion(messages=[...])
6070
61-
Examples:
62-
# Simple auto-reporting
71+
# Binary auto-reporting
6372
router = Router(
6473
goal="extract_email",
6574
paths=["gpt-4o", "claude-sonnet-4-20250514"],
@@ -68,6 +77,14 @@ class Router:
6877
response = router.completion(messages=[...])
6978
# report() called automatically
7079
80+
# Zero-config (default heuristic scoring)
81+
router = Router(
82+
goal="chat",
83+
paths=["gpt-4o", "claude-sonnet-4-20250514"]
84+
)
85+
response = router.completion(messages=[...])
86+
# Auto-scored using heuristics - no evaluation code needed
87+
7188
# Manual reporting for complex validation
7289
router = Router(
7390
goal="book_meeting",
@@ -88,12 +105,22 @@ def __init__(
88105
goal: str,
89106
paths: Optional[List[PathSpec]] = None,
90107
success_when: Optional[Callable[[str], bool]] = None,
108+
score_when: Optional[Callable[[str], float]] = None,
91109
exploration_rate: Optional[float] = None,
92110
auto_register: bool = True,
93111
):
94112
"""
95113
Initialize router.
96114
115+
Three scoring modes (in priority order):
116+
1. score_when: Continuous scoring (0.0-1.0). Best for quality optimization.
117+
Example: score_when=lambda out: min(1.0, len(out) / 500)
118+
2. success_when: Binary scoring (True/False). Good for pass/fail checks.
119+
Example: success_when=lambda out: "@" in out
120+
3. Default: When neither is provided, Kalibr auto-scores using heuristics
121+
(response length, structure, finish reason). Gives day-one metrics
122+
without any evaluation code.
123+
97124
Args:
98125
goal: Name of the goal (e.g., "book_meeting", "summarize")
99126
paths: List of models or path configs. Examples:
@@ -109,6 +136,11 @@ def __init__(
109136
Examples:
110137
success_when=lambda out: len(out) > 0 # Not empty
111138
success_when=lambda out: "@" in out # Contains email
139+
score_when: Optional function to auto-evaluate quality from LLM output.
140+
Takes the output string and returns a float (0.0-1.0).
141+
Takes priority over success_when if both are provided.
142+
Examples:
143+
score_when=lambda out: min(1.0, len(out) / 500)
112144
exploration_rate: Override exploration rate (0.0-1.0)
113145
auto_register: If True, register paths on init
114146
"""
@@ -130,6 +162,7 @@ def __init__(
130162
)
131163

132164
self.success_when = success_when
165+
self.score_when = score_when
133166
self.exploration_rate = exploration_rate
134167
self._last_trace_id: Optional[str] = None
135168
self._last_model_id: Optional[str] = None
@@ -293,12 +326,24 @@ def completion(
293326
# Success! Update state to reflect which model succeeded
294327
self._last_model_id = candidate_model
295328

296-
# Auto-report success if success_when provided
297-
if self.success_when and not self._outcome_reported:
329+
# Auto-report if any scoring mechanism is provided (or use defaults)
330+
if not self._outcome_reported:
298331
try:
299332
output = response.choices[0].message.content or ""
300-
success = self.success_when(output)
301-
self.report(success=success)
333+
334+
if self.score_when:
335+
# Priority 1: User-provided continuous scorer
336+
score = self.score_when(output)
337+
score = min(1.0, max(0.0, float(score)))
338+
self.report(success=score >= 0.5, score=score)
339+
elif self.success_when:
340+
# Priority 2: User-provided binary scorer
341+
success = self.success_when(output)
342+
self.report(success=success)
343+
else:
344+
# Priority 3: Default heuristic scoring (zero-config)
345+
score = self._default_score(response)
346+
self.report(success=score >= 0.5, score=score)
302347
except Exception as e:
303348
logger.warning(f"Auto-outcome evaluation failed: {e}")
304349

@@ -336,6 +381,72 @@ def completion(
336381
# Alias for common naming confusion
337382
complete = completion
338383

384+
def _default_score(self, response) -> float:
385+
"""
386+
Compute a heuristic quality score from an LLM response.
387+
Used when no success_when or score_when is provided.
388+
Gives users day-one quality metrics without writing evaluation code.
389+
390+
Signals (all normalized to 0-1, then weighted average):
391+
- non_empty: 1.0 if response has content, 0.0 if empty
392+
- length_score: normalized response length (sigmoid around 200 chars)
393+
- structure_score: bonus for JSON validity, markdown headers, bullet points
394+
- finish_reason_score: 1.0 for "stop", 0.5 for "length" (truncated), 0.0 for error
395+
"""
396+
try:
397+
content = response.choices[0].message.content or ""
398+
except (AttributeError, IndexError):
399+
return 0.0
400+
401+
# Signal 1: Non-empty (binary)
402+
non_empty = 1.0 if len(content.strip()) > 0 else 0.0
403+
if non_empty == 0.0:
404+
return 0.0 # Empty response is always 0
405+
406+
# Signal 2: Response length (sigmoid - most responses 50-2000 chars)
407+
import math
408+
char_count = len(content)
409+
# Sigmoid centered at 200 chars, gives ~0.5 at 200, ~0.95 at 1000
410+
length_score = 1.0 / (1.0 + math.exp(-0.005 * (char_count - 200)))
411+
412+
# Signal 3: Structure (JSON, markdown, lists indicate structured output)
413+
structure_score = 0.5 # baseline
414+
content_stripped = content.strip()
415+
# JSON detection
416+
if (content_stripped.startswith('{') and content_stripped.endswith('}')) or \
417+
(content_stripped.startswith('[') and content_stripped.endswith(']')):
418+
try:
419+
import json
420+
json.loads(content_stripped)
421+
structure_score = 1.0 # Valid JSON
422+
except (json.JSONDecodeError, ValueError):
423+
structure_score = 0.3 # Looks like JSON but invalid
424+
# Markdown/list detection
425+
elif any(marker in content for marker in ['## ', '- ', '* ', '1. ', '```']):
426+
structure_score = 0.8
427+
428+
# Signal 4: Finish reason
429+
try:
430+
finish_reason = response.choices[0].finish_reason
431+
if finish_reason == "stop":
432+
finish_score = 1.0
433+
elif finish_reason == "length":
434+
finish_score = 0.5 # Truncated
435+
else:
436+
finish_score = 0.3 # Unknown/error
437+
except (AttributeError, IndexError):
438+
finish_score = 0.5
439+
440+
# Weighted average
441+
score = (
442+
non_empty * 0.1 +
443+
length_score * 0.3 +
444+
structure_score * 0.3 +
445+
finish_score * 0.3
446+
)
447+
448+
return round(min(1.0, max(0.0, score)), 3)
449+
339450
def report(
340451
self,
341452
success: bool,

tests/test_router.py

Lines changed: 178 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,24 @@
11
"""Tests for Router class."""
22

33
import pytest
4+
from types import SimpleNamespace
45
from unittest.mock import patch, MagicMock
56

67
from kalibr.router import Router
78

89

10+
def _make_response(content="Hello world", finish_reason="stop"):
11+
"""Helper to create a mock OpenAI-format response."""
12+
return SimpleNamespace(
13+
choices=[
14+
SimpleNamespace(
15+
message=SimpleNamespace(content=content),
16+
finish_reason=finish_reason,
17+
)
18+
]
19+
)
20+
21+
922
class TestRouterInit:
1023
def test_basic_init(self):
1124
router = Router(goal="test", paths=["gpt-4o"], auto_register=False)
@@ -54,3 +67,168 @@ def test_double_report_warning(self):
5467
router._outcome_reported = True
5568
# Should not raise, just warn
5669
router.report(success=True)
70+
71+
72+
class TestDefaultScore:
    """Unit tests for the Router._default_score heuristic."""

    def test_empty_response_returns_zero(self):
        router = Router(goal="test", auto_register=False)
        assert router._default_score(_make_response(content="")) == 0.0

    def test_none_content_returns_zero(self):
        router = Router(goal="test", auto_register=False)
        assert router._default_score(_make_response(content=None)) == 0.0

    def test_whitespace_only_returns_zero(self):
        router = Router(goal="test", auto_register=False)
        assert router._default_score(_make_response(content=" \n ")) == 0.0

    def test_normal_text_above_half(self):
        router = Router(goal="test", auto_register=False)
        text = "This is a normal response with enough text to be useful." * 5
        assert router._default_score(_make_response(content=text)) > 0.5

    def test_valid_json_scores_high(self):
        # Valid JSON earns the maximum structure signal.
        router = Router(goal="test", auto_register=False)
        payload = '{"name": "Alice", "email": "[email protected]", "age": 30}'
        assert router._default_score(_make_response(content=payload)) > 0.5

    def test_invalid_json_scores_lower(self):
        router = Router(goal="test", auto_register=False)
        good = router._default_score(
            _make_response(content='{"name": "Alice", "email": "[email protected]"}')
        )
        bad = router._default_score(
            _make_response(content='{name: Alice, email: broken}')
        )
        assert good > bad

    def test_markdown_gets_structure_bonus(self):
        router = Router(goal="test", auto_register=False)
        plain_score = router._default_score(
            _make_response(content="Just some plain text response here.")
        )
        md_score = router._default_score(
            _make_response(content="## Header\n- Item 1\n- Item 2\n- Item 3")
        )
        assert md_score > plain_score

    def test_truncated_response_scores_lower(self):
        router = Router(goal="test", auto_register=False)
        text = "A decent response." * 20
        stopped = router._default_score(_make_response(content=text, finish_reason="stop"))
        truncated = router._default_score(_make_response(content=text, finish_reason="length"))
        assert stopped > truncated

    def test_score_between_zero_and_one(self):
        router = Router(goal="test", auto_register=False)
        for content in ["x", "hello world", "a" * 5000, '{"key": "value"}']:
            score = router._default_score(_make_response(content=content))
            assert 0.0 <= score <= 1.0

    def test_malformed_response_returns_zero(self):
        router = Router(goal="test", auto_register=False)
        # No choices attribute at all.
        assert router._default_score(SimpleNamespace()) == 0.0
        # Choices present but empty.
        assert router._default_score(SimpleNamespace(choices=[])) == 0.0
136+
137+
138+
class TestScoreWhen:
    """Tests for the score_when continuous-scoring hook."""

    def test_score_when_stored(self):
        scorer = lambda out: 0.8
        router = Router(goal="test", score_when=scorer, auto_register=False)
        assert router.score_when is scorer

    @patch("kalibr.router.Router._dispatch")
    @patch("kalibr.router.Router.report")
    @patch("kalibr.intelligence.decide")
    def test_score_when_called_with_output(self, mock_decide, mock_report, mock_dispatch):
        mock_decide.return_value = {"model_id": "gpt-4o", "trace_id": "abc123"}
        mock_dispatch.return_value = _make_response(content="test output")

        router = Router(goal="test", score_when=lambda out: 0.75, auto_register=False)
        msgs = [{"role": "user", "content": "hi"}]
        router.completion(messages=msgs)

        mock_report.assert_called_once_with(success=True, score=0.75)

    @patch("kalibr.router.Router._dispatch")
    @patch("kalibr.router.Router.report")
    @patch("kalibr.intelligence.decide")
    def test_score_when_clamps_values(self, mock_decide, mock_report, mock_dispatch):
        mock_decide.return_value = {"model_id": "gpt-4o", "trace_id": "abc123"}
        mock_dispatch.return_value = _make_response(content="test")

        # Scorer returns a value above 1.0; the router must clamp it.
        router = Router(goal="test", score_when=lambda out: 1.5, auto_register=False)
        msgs = [{"role": "user", "content": "hi"}]
        router.completion(messages=msgs)

        mock_report.assert_called_once_with(success=True, score=1.0)

    @patch("kalibr.router.Router._dispatch")
    @patch("kalibr.router.Router.report")
    @patch("kalibr.intelligence.decide")
    def test_score_when_low_score_reports_failure(self, mock_decide, mock_report, mock_dispatch):
        mock_decide.return_value = {"model_id": "gpt-4o", "trace_id": "abc123"}
        mock_dispatch.return_value = _make_response(content="test")

        router = Router(goal="test", score_when=lambda out: 0.2, auto_register=False)
        msgs = [{"role": "user", "content": "hi"}]
        router.completion(messages=msgs)

        # Scores below 0.5 are reported as failures.
        mock_report.assert_called_once_with(success=False, score=0.2)

    @patch("kalibr.router.Router._dispatch")
    @patch("kalibr.router.Router.report")
    @patch("kalibr.intelligence.decide")
    def test_score_when_takes_priority_over_success_when(self, mock_decide, mock_report, mock_dispatch):
        mock_decide.return_value = {"model_id": "gpt-4o", "trace_id": "abc123"}
        mock_dispatch.return_value = _make_response(content="test output")

        # Both hooks supplied: score_when must win over success_when.
        router = Router(
            goal="test",
            score_when=lambda out: 0.9,
            success_when=lambda out: False,  # would report failure if used
            auto_register=False,
        )
        msgs = [{"role": "user", "content": "hi"}]
        router.completion(messages=msgs)

        # score_when used (0.9 -> success=True); success_when ignored.
        mock_report.assert_called_once_with(success=True, score=0.9)
199+
200+
201+
class TestDefaultScoringIntegration:
    """End-to-end checks of the zero-config default scoring path."""

    @patch("kalibr.router.Router._dispatch")
    @patch("kalibr.router.Router.report")
    @patch("kalibr.intelligence.decide")
    def test_default_scoring_fires_when_no_scorers(self, mock_decide, mock_report, mock_dispatch):
        mock_decide.return_value = {"model_id": "gpt-4o", "trace_id": "abc123"}
        mock_dispatch.return_value = _make_response(
            content="A good response with enough content." * 5
        )

        # Neither score_when nor success_when supplied.
        router = Router(goal="test", auto_register=False)
        router.completion(messages=[{"role": "user", "content": "hi"}])

        # The heuristic fallback must still report an outcome with a score.
        mock_report.assert_called_once()
        call = mock_report.call_args
        assert "score" in call.kwargs or len(call.args) > 1

    @patch("kalibr.router.Router._dispatch")
    @patch("kalibr.router.Router.report")
    @patch("kalibr.intelligence.decide")
    def test_success_when_takes_priority_over_default(self, mock_decide, mock_report, mock_dispatch):
        mock_decide.return_value = {"model_id": "gpt-4o", "trace_id": "abc123"}
        mock_dispatch.return_value = _make_response(content="no-email-here")

        # With success_when supplied, heuristic scoring must not run.
        router = Router(
            goal="test",
            success_when=lambda out: "@" in out,
            auto_register=False,
        )
        router.completion(messages=[{"role": "user", "content": "hi"}])

        # "@" is absent, so success=False — and no score kwarg from heuristics.
        mock_report.assert_called_once_with(success=False)

0 commit comments

Comments
 (0)