
Commit 5b7c8c0

jcpagadora737 authored and copybara-github committed
chore: Introduce MetricInfoProvider interface, and refactor metric evaluators to use this interface to provide MetricInfo
Co-authored-by: Joseph Pagadora <[email protected]>
PiperOrigin-RevId: 851406110
1 parent 07bb164 commit 5b7c8c0

19 files changed: +357 -310 lines changed
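For orientation before the per-file hunks: the refactor moves metric metadata out of static get_metric_info() methods on the evaluators and into dedicated MetricInfoProvider classes. A minimal sketch of the new pattern follows; the WordCountMetricInfoProvider class and its metric name are invented for illustration only, while MetricInfo, MetricInfoProvider, MetricValueInfo, and Interval are the types this commit touches in eval_metrics.py.

from google.adk.evaluation.eval_metrics import Interval
from google.adk.evaluation.eval_metrics import MetricInfo
from google.adk.evaluation.eval_metrics import MetricInfoProvider
from google.adk.evaluation.eval_metrics import MetricValueInfo


class WordCountMetricInfoProvider(MetricInfoProvider):
  """Hypothetical provider, used only to illustrate the new interface."""

  def get_metric_info(self) -> MetricInfo:
    return MetricInfo(
        metric_name="word_count_match",  # invented name, not a prebuilt metric
        description="Illustrative metric with values in [0,1].",
        metric_value_info=MetricValueInfo(
            interval=Interval(min_value=0.0, max_value=1.0)
        ),
    )


# The registry (refactored below) consumes exactly this kind of object.
info = WordCountMetricInfoProvider().get_metric_info()
print(info.metric_name, info.description)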

src/google/adk/evaluation/eval_metrics.py

Lines changed: 10 additions & 0 deletions
@@ -14,6 +14,7 @@

 from __future__ import annotations

+import abc
 from enum import Enum
 from typing import Optional
 from typing import Union
@@ -362,3 +363,12 @@ class MetricInfo(EvalBaseModel):
   metric_value_info: MetricValueInfo = Field(
       description="Information on the nature of values supported by the metric."
   )
+
+
+class MetricInfoProvider(abc.ABC):
+  """Interface for providing MetricInfo."""
+
+  @abc.abstractmethod
+  def get_metric_info(self) -> MetricInfo:
+    """Returns MetricInfo for a given metric."""
+    raise NotImplementedError
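Because MetricInfoProvider extends abc.ABC and marks get_metric_info as abstract, a subclass that does not override the method cannot be instantiated. A quick sanity-check sketch (IncompleteProvider is hypothetical, added only to show the standard abc behavior):

from google.adk.evaluation.eval_metrics import MetricInfoProvider


class IncompleteProvider(MetricInfoProvider):
  """Hypothetical subclass that does not override get_metric_info."""


try:
  IncompleteProvider()
except TypeError as err:
  # abc refuses to instantiate a class with unimplemented abstract methods.
  print(f"Cannot instantiate: {err}")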

src/google/adk/evaluation/final_response_match_v1.py

Lines changed: 0 additions & 18 deletions
@@ -23,10 +23,6 @@
 from .eval_case import ConversationScenario
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
-from .eval_metrics import Interval
-from .eval_metrics import MetricInfo
-from .eval_metrics import MetricValueInfo
-from .eval_metrics import PrebuiltMetrics
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
@@ -42,20 +38,6 @@ class RougeEvaluator(Evaluator):
   def __init__(self, eval_metric: EvalMetric):
     self._eval_metric = eval_metric

-  @staticmethod
-  def get_metric_info() -> MetricInfo:
-    return MetricInfo(
-        metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
-        description=(
-            "This metric evaluates if the agent's final response matches a"
-            " golden/expected final response using Rouge_1 metric. Value range"
-            " for this metric is [0,1], with values closer to 1 more desirable."
-        ),
-        metric_value_info=MetricValueInfo(
-            interval=Interval(min_value=0.0, max_value=1.0)
-        ),
-    )
-
   @override
   def evaluate_invocations(
       self,
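The Rouge_1 metadata deleted above is not dropped from the package: the same RESPONSE_MATCH_SCORE info now comes from ResponseEvaluatorMetricInfoProvider in the new metric_info_providers.py shown later in this commit. A sketch of fetching it there (the google.adk.evaluation.metric_info_providers module path is assumed from the relative imports in the registry hunk below):

from google.adk.evaluation.eval_metrics import PrebuiltMetrics
from google.adk.evaluation.metric_info_providers import ResponseEvaluatorMetricInfoProvider

info = ResponseEvaluatorMetricInfoProvider(
    PrebuiltMetrics.RESPONSE_MATCH_SCORE.value
).get_metric_info()
# Prints the Rouge_1-based description with the [0,1] value range.
print(info.description)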

src/google/adk/evaluation/final_response_match_v2.py

Lines changed: 0 additions & 18 deletions
@@ -26,11 +26,7 @@
 from .eval_case import Invocation
 from .eval_metrics import EvalMetric
 from .eval_metrics import EvalStatus
-from .eval_metrics import Interval
 from .eval_metrics import LlmAsAJudgeCriterion
-from .eval_metrics import MetricInfo
-from .eval_metrics import MetricValueInfo
-from .eval_metrics import PrebuiltMetrics
 from .evaluator import EvaluationResult
 from .evaluator import PerInvocationResult
 from .llm_as_judge import AutoRaterScore
@@ -154,20 +150,6 @@ def __init__(
     )
     self._auto_rater_prompt_template = _FINAL_RESPONSE_MATCH_V2_PROMPT

-  @staticmethod
-  def get_metric_info() -> MetricInfo:
-    return MetricInfo(
-        metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
-        description=(
-            "This metric evaluates if the agent's final response matches a"
-            " golden/expected final response using LLM as a judge. Value range"
-            " for this metric is [0,1], with values closer to 1 more desirable."
-        ),
-        metric_value_info=MetricValueInfo(
-            interval=Interval(min_value=0.0, max_value=1.0)
-        ),
-    )
-
   @override
   def format_auto_rater_prompt(
       self,

src/google/adk/evaluation/hallucinations_v1.py

Lines changed: 0 additions & 19 deletions
@@ -40,10 +40,6 @@
 from .eval_case import InvocationEvents
 from .eval_metrics import EvalMetric
 from .eval_metrics import HallucinationsCriterion
-from .eval_metrics import Interval
-from .eval_metrics import MetricInfo
-from .eval_metrics import MetricValueInfo
-from .eval_metrics import PrebuiltMetrics
 from .evaluator import EvalStatus
 from .evaluator import EvaluationResult
 from .evaluator import Evaluator
@@ -310,21 +306,6 @@ def _setup_auto_rater(self) -> BaseLlm:
     llm_class = llm_registry.resolve(model_id)
     return llm_class(model=model_id)

-  @staticmethod
-  def get_metric_info() -> MetricInfo:
-    return MetricInfo(
-        metric_name=PrebuiltMetrics.HALLUCINATIONS_V1.value,
-        description=(
-            "This metric assesses whether a model response contains any false,"
-            " contradictory, or unsupported claims using a LLM as judge. Value"
-            " range for this metric is [0,1], with values closer to 1 more"
-            " desirable."
-        ),
-        metric_value_info=MetricValueInfo(
-            interval=Interval(min_value=0.0, max_value=1.0)
-        ),
-    )
-
   def _create_context_for_step(
       self,
       app_details: Optional[AppDetails],

src/google/adk/evaluation/metric_evaluator_registry.py

Lines changed: 19 additions & 11 deletions
@@ -24,6 +24,14 @@
 from .evaluator import Evaluator
 from .final_response_match_v2 import FinalResponseMatchV2Evaluator
 from .hallucinations_v1 import HallucinationsV1Evaluator
+from .metric_info_providers import FinalResponseMatchV2EvaluatorMetricInfoProvider
+from .metric_info_providers import HallucinationsV1EvaluatorMetricInfoProvider
+from .metric_info_providers import PerTurnUserSimulatorQualityV1MetricInfoProvider
+from .metric_info_providers import ResponseEvaluatorMetricInfoProvider
+from .metric_info_providers import RubricBasedFinalResponseQualityV1EvaluatorMetricInfoProvider
+from .metric_info_providers import RubricBasedToolUseV1EvaluatorMetricInfoProvider
+from .metric_info_providers import SafetyEvaluatorV1MetricInfoProvider
+from .metric_info_providers import TrajectoryEvaluatorMetricInfoProvider
 from .response_evaluator import ResponseEvaluator
 from .rubric_based_final_response_quality_v1 import RubricBasedFinalResponseQualityV1Evaluator
 from .rubric_based_tool_use_quality_v1 import RubricBasedToolUseV1Evaluator
@@ -91,44 +99,44 @@ def _get_default_metric_evaluator_registry() -> MetricEvaluatorRegistry:
   metric_evaluator_registry = MetricEvaluatorRegistry()

   metric_evaluator_registry.register_evaluator(
-      metric_info=TrajectoryEvaluator.get_metric_info(),
+      metric_info=TrajectoryEvaluatorMetricInfoProvider().get_metric_info(),
       evaluator=TrajectoryEvaluator,
   )

   metric_evaluator_registry.register_evaluator(
-      metric_info=ResponseEvaluator.get_metric_info(
+      metric_info=ResponseEvaluatorMetricInfoProvider(
           PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value
-      ),
+      ).get_metric_info(),
       evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_info=ResponseEvaluator.get_metric_info(
+      metric_info=ResponseEvaluatorMetricInfoProvider(
          PrebuiltMetrics.RESPONSE_MATCH_SCORE.value
-      ),
+      ).get_metric_info(),
       evaluator=ResponseEvaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_info=SafetyEvaluatorV1.get_metric_info(),
+      metric_info=SafetyEvaluatorV1MetricInfoProvider().get_metric_info(),
       evaluator=SafetyEvaluatorV1,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_info=FinalResponseMatchV2Evaluator.get_metric_info(),
+      metric_info=FinalResponseMatchV2EvaluatorMetricInfoProvider().get_metric_info(),
       evaluator=FinalResponseMatchV2Evaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_info=RubricBasedFinalResponseQualityV1Evaluator.get_metric_info(),
+      metric_info=RubricBasedFinalResponseQualityV1EvaluatorMetricInfoProvider().get_metric_info(),
       evaluator=RubricBasedFinalResponseQualityV1Evaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_info=HallucinationsV1Evaluator.get_metric_info(),
+      metric_info=HallucinationsV1EvaluatorMetricInfoProvider().get_metric_info(),
       evaluator=HallucinationsV1Evaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_info=RubricBasedToolUseV1Evaluator.get_metric_info(),
+      metric_info=RubricBasedToolUseV1EvaluatorMetricInfoProvider().get_metric_info(),
       evaluator=RubricBasedToolUseV1Evaluator,
   )
   metric_evaluator_registry.register_evaluator(
-      metric_info=PerTurnUserSimulatorQualityV1.get_metric_info(),
+      metric_info=PerTurnUserSimulatorQualityV1MetricInfoProvider().get_metric_info(),
       evaluator=PerTurnUserSimulatorQualityV1,
   )
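Each registration above now evaluates SomeProvider().get_metric_info() and hands the resulting MetricInfo to the registry. The short sketch below does the same outside the registry, using three of the providers added in this commit, to show what the registry receives; the google.adk.evaluation.metric_info_providers module path is assumed from the relative imports in the hunk above.

from google.adk.evaluation.eval_metrics import PrebuiltMetrics
from google.adk.evaluation.metric_info_providers import ResponseEvaluatorMetricInfoProvider
from google.adk.evaluation.metric_info_providers import SafetyEvaluatorV1MetricInfoProvider
from google.adk.evaluation.metric_info_providers import TrajectoryEvaluatorMetricInfoProvider

providers = [
    TrajectoryEvaluatorMetricInfoProvider(),
    SafetyEvaluatorV1MetricInfoProvider(),
    # ResponseEvaluatorMetricInfoProvider is parameterized by the metric it describes.
    ResponseEvaluatorMetricInfoProvider(PrebuiltMetrics.RESPONSE_MATCH_SCORE.value),
]

for provider in providers:
  info = provider.get_metric_info()
  interval = info.metric_value_info.interval
  print(info.metric_name, interval.min_value, interval.max_value)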

src/google/adk/evaluation/metric_info_providers.py

Lines changed: 185 additions & 0 deletions
@@ -0,0 +1,185 @@
+# Copyright 2025 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from .eval_metrics import Interval
+from .eval_metrics import MetricInfo
+from .eval_metrics import MetricInfoProvider
+from .eval_metrics import MetricValueInfo
+from .eval_metrics import PrebuiltMetrics
+
+
+class TrajectoryEvaluatorMetricInfoProvider(MetricInfoProvider):
+  """Metric info provider for TrajectoryEvaluator."""
+
+  def get_metric_info(self) -> MetricInfo:
+    return MetricInfo(
+        metric_name=PrebuiltMetrics.TOOL_TRAJECTORY_AVG_SCORE.value,
+        description=(
+            "This metric compares two tool call trajectories (expected vs."
+            " actual) for the same user interaction. It performs an exact match"
+            " on the tool name and arguments for each step in the trajectory."
+            " A score of 1.0 indicates a perfect match, while 0.0 indicates a"
+            " mismatch. Higher values are better."
+        ),
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
+
+
+class ResponseEvaluatorMetricInfoProvider(MetricInfoProvider):
+  """Metric info provider for ResponseEvaluator."""
+
+  def __init__(self, metric_name: str):
+    self._metric_name = metric_name
+
+  def get_metric_info(self) -> MetricInfo:
+    """Returns MetricInfo for the given metric name."""
+    if PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value == self._metric_name:
+      return MetricInfo(
+          metric_name=PrebuiltMetrics.RESPONSE_EVALUATION_SCORE.value,
+          description=(
+              "This metric evaluates how coherent agent's response was. Value"
+              " range of this metric is [1,5], with values closer to 5 more"
+              " desirable."
+          ),
+          metric_value_info=MetricValueInfo(
+              interval=Interval(min_value=1.0, max_value=5.0)
+          ),
+      )
+    elif PrebuiltMetrics.RESPONSE_MATCH_SCORE.value == self._metric_name:
+      return MetricInfo(
+          metric_name=PrebuiltMetrics.RESPONSE_MATCH_SCORE.value,
+          description=(
+              "This metric evaluates if the agent's final response matches a"
+              " golden/expected final response using Rouge_1 metric. Value"
+              " range for this metric is [0,1], with values closer to 1 more"
+              " desirable."
+          ),
+          metric_value_info=MetricValueInfo(
+              interval=Interval(min_value=0.0, max_value=1.0)
+          ),
+      )
+    else:
+      raise ValueError(f"`{self._metric_name}` is not supported.")
+
+
+class SafetyEvaluatorV1MetricInfoProvider(MetricInfoProvider):
+  """Metric info provider for SafetyEvaluatorV1."""
+
+  def get_metric_info(self) -> MetricInfo:
+    return MetricInfo(
+        metric_name=PrebuiltMetrics.SAFETY_V1.value,
+        description=(
+            "This metric evaluates the safety (harmlessness) of an Agent's"
+            " Response. Value range of the metric is [0, 1], with values closer"
+            " to 1 to be more desirable (safe)."
+        ),
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
+
+
+class FinalResponseMatchV2EvaluatorMetricInfoProvider(MetricInfoProvider):
+  """Metric info provider for FinalResponseMatchV2Evaluator."""
+
+  def get_metric_info(self) -> MetricInfo:
+    return MetricInfo(
+        metric_name=PrebuiltMetrics.FINAL_RESPONSE_MATCH_V2.value,
+        description=(
+            "This metric evaluates if the agent's final response matches a"
+            " golden/expected final response using LLM as a judge. Value range"
+            " for this metric is [0,1], with values closer to 1 more desirable."
+        ),
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
+
+
+class RubricBasedFinalResponseQualityV1EvaluatorMetricInfoProvider(
+    MetricInfoProvider
+):
+  """Metric info provider for RubricBasedFinalResponseQualityV1Evaluator."""
+
+  def get_metric_info(self) -> MetricInfo:
+    return MetricInfo(
+        metric_name=PrebuiltMetrics.RUBRIC_BASED_FINAL_RESPONSE_QUALITY_V1.value,
+        description=(
+            "This metric assess if the agent's final response against a set of"
+            " rubrics using LLM as a judge. Value range for this metric is"
+            " [0,1], with values closer to 1 more desirable."
+        ),
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
+
+
+class HallucinationsV1EvaluatorMetricInfoProvider(MetricInfoProvider):
+  """Metric info provider for HallucinationsV1Evaluator."""
+
+  def get_metric_info(self) -> MetricInfo:
+    return MetricInfo(
+        metric_name=PrebuiltMetrics.HALLUCINATIONS_V1.value,
+        description=(
+            "This metric assesses whether a model response contains any false,"
+            " contradictory, or unsupported claims using a LLM as judge. Value"
+            " range for this metric is [0,1], with values closer to 1 more"
+            " desirable."
+        ),
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
+
+
+class RubricBasedToolUseV1EvaluatorMetricInfoProvider(MetricInfoProvider):
+  """Metric info provider for RubricBasedToolUseV1Evaluator."""
+
+  def get_metric_info(self) -> MetricInfo:
+    return MetricInfo(
+        metric_name=PrebuiltMetrics.RUBRIC_BASED_TOOL_USE_QUALITY_V1.value,
+        description=(
+            "This metric assess if the agent's usage of tools against a set of"
+            " rubrics using LLM as a judge. Value range for this metric is"
+            " [0,1], with values closer to 1 more desirable."
+        ),
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
+
+
+class PerTurnUserSimulatorQualityV1MetricInfoProvider(MetricInfoProvider):
+  """Metric info provider for PerTurnUserSimulatorQualityV1."""
+
+  def get_metric_info(self) -> MetricInfo:
+    return MetricInfo(
+        metric_name=PrebuiltMetrics.PER_TURN_USER_SIMULATOR_QUALITY_V1,
+        description=(
+            "This metric evaluates if the user messages generated by a "
+            "user simulator follow the given conversation scenario. It "
+            "validates each message separately. The resulting metric "
+            "computes the percentage of user messages that we mark as "
+            "valid. The value range for this metric is [0,1], with values "
+            "closer to 1 more desirable. "
+        ),
+        metric_value_info=MetricValueInfo(
+            interval=Interval(min_value=0.0, max_value=1.0)
+        ),
+    )
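ResponseEvaluatorMetricInfoProvider only describes the two response metrics; per its else branch above, any other metric name raises ValueError. A quick sketch of that guard (module path assumed as before, and "not_a_real_metric" is an invented name):

from google.adk.evaluation.metric_info_providers import ResponseEvaluatorMetricInfoProvider

try:
  ResponseEvaluatorMetricInfoProvider("not_a_real_metric").get_metric_info()
except ValueError as err:
  # Prints: `not_a_real_metric` is not supported.
  print(err)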
