Skip to content

Commit 0918b64

Browse files
KPJoshi and copybara-github
authored and committed
fix: fix inconsistent method signatures for evaluate_invocations
The evaluate_invocations method overrides in Evaluator subclasses were not consistent, leading to errors during calls, especially when using kwargs. Made the overrides and calls consistent to resolve this issue.

Co-authored-by: Keyur Joshi <keyurj@google.com>
PiperOrigin-RevId: 850462752
1 parent 38a30a4 commit 0918b64

File tree

11 files changed

+48
-24
lines changed

11 files changed

+48
-24
lines changed

src/google/adk/evaluation/evaluator.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,8 +62,8 @@ class Evaluator(ABC):
6262
def evaluate_invocations(
6363
self,
6464
actual_invocations: list[Invocation],
65-
expected_invocations: Optional[list[Invocation]],
66-
conversation_scenario: Optional[ConversationScenario],
65+
expected_invocations: Optional[list[Invocation]] = None,
66+
conversation_scenario: Optional[ConversationScenario] = None,
6767
) -> EvaluationResult:
6868
"""Returns EvaluationResult after performing evaluations using actual and expected invocations.
6969

src/google/adk/evaluation/final_response_match_v1.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -60,11 +60,12 @@ def get_metric_info() -> MetricInfo:
6060
def evaluate_invocations(
6161
self,
6262
actual_invocations: list[Invocation],
63-
expected_invocations: Optional[list[Invocation]],
64-
_: Optional[ConversationScenario] = None,
63+
expected_invocations: Optional[list[Invocation]] = None,
64+
conversation_scenario: Optional[ConversationScenario] = None,
6565
) -> EvaluationResult:
6666
if expected_invocations is None:
6767
raise ValueError("expected_invocations is required for this metric.")
68+
del conversation_scenario # not used by this metric.
6869

6970
total_score = 0.0
7071
num_invocations = 0

src/google/adk/evaluation/hallucinations_v1.py

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -720,16 +720,19 @@ def _aggregate_invocation_results(
720720
async def evaluate_invocations(
721721
self,
722722
actual_invocations: list[Invocation],
723-
expected_invocations: Optional[list[Invocation]],
724-
_: Optional[ConversationScenario] = None,
723+
expected_invocations: Optional[list[Invocation]] = None,
724+
conversation_scenario: Optional[ConversationScenario] = None,
725725
) -> EvaluationResult:
726+
del conversation_scenario # not used by this metric.
727+
726728
# expected_invocations are not required by the metric and if they are not
727729
# supplied, we provide a list of None to rest of the code.
728730
expected_invocations = (
729731
[None] * len(actual_invocations)
730732
if expected_invocations is None
731733
else expected_invocations
732734
)
735+
733736
per_invocation_results = []
734737
for actual, expected in zip(actual_invocations, expected_invocations):
735738
step_evaluations = self._get_steps_to_evaluate(actual)

src/google/adk/evaluation/llm_as_judge.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -118,11 +118,12 @@ def aggregate_invocation_results(
118118
async def evaluate_invocations(
119119
self,
120120
actual_invocations: list[Invocation],
121-
expected_invocations: Optional[list[Invocation]],
122-
_: Optional[ConversationScenario] = None,
121+
expected_invocations: Optional[list[Invocation]] = None,
122+
conversation_scenario: Optional[ConversationScenario] = None,
123123
) -> EvaluationResult:
124124
if self._expected_invocations_required and expected_invocations is None:
125125
raise ValueError("expected_invocations is needed by this metric.")
126+
del conversation_scenario # not supported for per-invocation evaluation.
126127

127128
# If expected_invocation are not required by the metric and if they are not
128129
# supplied, we provide a list of None.

src/google/adk/evaluation/local_eval_service.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -370,6 +370,7 @@ async def _evaluate_metric(
370370
return metric_evaluator.evaluate_invocations(
371371
actual_invocations=actual_invocations,
372372
expected_invocations=expected_invocations,
373+
conversation_scenario=conversation_scenario,
373374
)
374375

375376
def _generate_final_eval_status(

src/google/adk/evaluation/response_evaluator.py

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -100,20 +100,22 @@ def get_metric_info(metric_name: str) -> MetricInfo:
100100
def evaluate_invocations(
101101
self,
102102
actual_invocations: list[Invocation],
103-
expected_invocations: Optional[list[Invocation]],
104-
_: Optional[ConversationScenario] = None,
103+
expected_invocations: Optional[list[Invocation]] = None,
104+
conversation_scenario: Optional[ConversationScenario] = None,
105105
) -> EvaluationResult:
106106
# If the metric is response_match_score, just use the RougeEvaluator.
107107
if self._metric_name == PrebuiltMetrics.RESPONSE_MATCH_SCORE.value:
108108
rouge_evaluator = RougeEvaluator(
109109
EvalMetric(metric_name=self._metric_name, threshold=self._threshold)
110110
)
111111
return rouge_evaluator.evaluate_invocations(
112-
actual_invocations, expected_invocations
112+
actual_invocations, expected_invocations, conversation_scenario
113113
)
114114

115115
return _VertexAiEvalFacade(
116116
threshold=self._threshold,
117117
metric_name=self._metric_name,
118118
expected_invocations_required=True,
119-
).evaluate_invocations(actual_invocations, expected_invocations)
119+
).evaluate_invocations(
120+
actual_invocations, expected_invocations, conversation_scenario
121+
)

src/google/adk/evaluation/safety_evaluator.py

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,7 @@
1818

1919
from typing_extensions import override
2020

21+
from .eval_case import ConversationScenario
2122
from .eval_case import Invocation
2223
from .eval_metrics import EvalMetric
2324
from .eval_metrics import Interval
@@ -65,11 +66,14 @@ def get_metric_info() -> MetricInfo:
6566
def evaluate_invocations(
6667
self,
6768
actual_invocations: list[Invocation],
68-
expected_invocations: Optional[list[Invocation]],
69+
expected_invocations: Optional[list[Invocation]] = None,
70+
conversation_scenario: Optional[ConversationScenario] = None,
6971
) -> EvaluationResult:
7072
from ..dependencies.vertexai import vertexai
7173

7274
return _VertexAiEvalFacade(
7375
threshold=self._eval_metric.threshold,
7476
metric_name=vertexai.types.PrebuiltMetric.SAFETY,
75-
).evaluate_invocations(actual_invocations, expected_invocations)
77+
).evaluate_invocations(
78+
actual_invocations, expected_invocations, conversation_scenario
79+
)

src/google/adk/evaluation/simulation/per_turn_user_simulator_quality_v1.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -290,10 +290,12 @@ def get_metric_info() -> MetricInfo:
290290
async def evaluate_invocations(
291291
self,
292292
actual_invocations: list[Invocation],
293-
expected_invocations: Optional[list[Invocation]],
294-
conversation_scenario: Optional[ConversationScenario],
293+
expected_invocations: Optional[list[Invocation]] = None,
294+
conversation_scenario: Optional[ConversationScenario] = None,
295295
) -> EvaluationResult:
296-
del expected_invocations
296+
del expected_invocations # not used by this metric.
297+
if conversation_scenario is None:
298+
raise ValueError("conversation_scenario is needed by this metric.")
297299

298300
# Evaluate the first invocation contains the given starting prompt.
299301
results = [

src/google/adk/evaluation/trajectory_evaluator.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
from pydantic import ValidationError
2323
from typing_extensions import override
2424

25+
from .eval_case import ConversationScenario
2526
from .eval_case import get_all_tool_calls
2627
from .eval_case import Invocation
2728
from .eval_metrics import EvalMetric
@@ -118,11 +119,13 @@ def get_metric_info() -> MetricInfo:
118119
def evaluate_invocations(
119120
self,
120121
actual_invocations: list[Invocation],
121-
expected_invocations: Optional[list[Invocation]],
122+
expected_invocations: Optional[list[Invocation]] = None,
123+
conversation_scenario: Optional[ConversationScenario] = None,
122124
) -> EvaluationResult:
123125
"""Returns EvaluationResult after performing evaluations using actual and expected invocations."""
124126
if expected_invocations is None:
125127
raise ValueError("expected_invocations is needed by this metric.")
128+
del conversation_scenario # not supported for per-invocation evaluation.
126129

127130
total_tool_use_accuracy = 0.0
128131
num_invocations = 0

src/google/adk/evaluation/vertex_ai_eval_facade.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -69,11 +69,12 @@ def __init__(
6969
def evaluate_invocations(
7070
self,
7171
actual_invocations: list[Invocation],
72-
expected_invocations: Optional[list[Invocation]],
73-
_: Optional[ConversationScenario] = None,
72+
expected_invocations: Optional[list[Invocation]] = None,
73+
conversation_scenario: Optional[ConversationScenario] = None,
7474
) -> EvaluationResult:
7575
if self._expected_invocations_required and expected_invocations is None:
7676
raise ValueError("expected_invocations is needed by this metric.")
77+
del conversation_scenario # not supported for per-invocation evaluation.
7778

7879
# If expected_invocation are not required by the metric and if they are not
7980
# supplied, we provide a list of None.

0 commit comments

Comments (0)