
Commit 1876698

jsondai authored and copybara-github committed
chore: GenAI Client - Add replay tests for additional RubricMetrics coverage in evals SDK
PiperOrigin-RevId: 900984771
1 parent 659f8fd commit 1876698

2 files changed: 184 additions & 3 deletions
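
For orientation, the call pattern these replay tests exercise looks roughly like the sketch below. This is a minimal illustration, not part of the commit: the vertexai.Client(...) construction, the import paths, and the project/location values are assumptions (the tests instead receive a pre-configured client from the replay harness's pytest fixture).

# Minimal sketch of the evals call pattern exercised in the diffs below.
# Assumption: the client is built via vertexai.Client(...); the replay
# tests receive a pre-configured `client` fixture instead. Project and
# location are hypothetical placeholders.
import pandas as pd
import vertexai
from vertexai._genai import types  # import path assumed from the test modules

client = vertexai.Client(project="my-project", location="us-central1")

# One-row dataset: a prompt plus a precomputed response to grade.
prompts_df = pd.DataFrame(
    {
        "prompt": ["Summarize the benefits of regular exercise."],
        "response": ["Exercise improves heart health, mood, and sleep."],
    }
)
eval_dataset = types.EvaluationDataset(eval_dataset_df=prompts_df)

# Predefined rubric-based metrics, as used throughout the diffs below.
metrics = [
    types.RubricMetric.GENERAL_QUALITY,
    types.RubricMetric.TEXT_QUALITY,
]

evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)
for summary in evaluation_result.summary_metrics:
    print(summary.metric_name, summary.mean_score)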


tests/unit/vertexai/genai/replays/test_evaluate.py

Lines changed: 121 additions & 1 deletion
@@ -458,10 +458,130 @@ def parse_results(responses):
         "my_custom_metric"
     ]
     assert metric_result.score is not None
-    assert metric_result.score > 0.2
+    assert metric_result.score >= 0.0
     assert metric_result.error_message is None
 
 
+def test_evaluation_single_turn_agent_data(client):
+    """Tests single-turn AgentData eval with agent quality metrics."""
+    client._api_client._http_options.api_version = "v1beta1"
+
+    weather_agent = {
+        "weather_bot": types.evals.AgentConfig(
+            agent_id="weather_bot",
+            agent_type="SpecialistAgent",
+            description="Handles weather queries.",
+            instruction=(
+                "You are a weather assistant. Use the get_weather tool to"
+                " answer weather questions."
+            ),
+            tools=[
+                genai_types.Tool(
+                    function_declarations=[
+                        genai_types.FunctionDeclaration(
+                            name="get_weather",
+                            description=(
+                                "Gets the current weather for a given location."
+                            )
+                        )
+                    ]
+                )
+            ],
+        ),
+    }
+
+    eval_case = types.EvalCase(
+        eval_case_id="successful-tool-use",
+        agent_data=types.evals.AgentData(
+            agents=weather_agent,
+            turns=[
+                types.evals.ConversationTurn(
+                    turn_index=0,
+                    events=[
+                        types.evals.AgentEvent(
+                            author="user",
+                            content=genai_types.Content(
+                                role="user",
+                                parts=[
+                                    genai_types.Part(
+                                        text="What is the weather in Tokyo?"
+                                    )
+                                ],
+                            ),
+                        ),
+                        types.evals.AgentEvent(
+                            author="weather_bot",
+                            content=genai_types.Content(
+                                role="model",
+                                parts=[
+                                    genai_types.Part(
+                                        function_call=genai_types.FunctionCall(
+                                            id="tool_call_0",
+                                            name="get_weather",
+                                            args={"location": "Tokyo"},
+                                        )
+                                    )
+                                ],
+                            ),
+                        ),
+                        types.evals.AgentEvent(
+                            author="weather_bot",
+                            content=genai_types.Content(
+                                role="tool",
+                                parts=[
+                                    genai_types.Part(
+                                        function_response=genai_types.FunctionResponse(
+                                            id="tool_call_0",
+                                            name="get_weather",
+                                            response={"weather": "75F and sunny"},
+                                        )
+                                    )
+                                ],
+                            ),
+                        ),
+                        types.evals.AgentEvent(
+                            author="weather_bot",
+                            content=genai_types.Content(
+                                role="model",
+                                parts=[
+                                    genai_types.Part(
+                                        text=(
+                                            "It is currently 75F and sunny in Tokyo."
+                                        )
+                                    )
+                                ],
+                            ),
+                        ),
+                    ],
+                )
+            ],
+        ),
+    )
+
+    eval_dataset = types.EvaluationDataset(eval_cases=[eval_case])
+
+    metrics = [
+        types.RubricMetric.FINAL_RESPONSE_QUALITY,
+        types.RubricMetric.TOOL_USE_QUALITY,
+        types.RubricMetric.HALLUCINATION,
+        types.RubricMetric.SAFETY,
+        types.RubricMetric.GENERAL_QUALITY,
+        types.RubricMetric.TEXT_QUALITY,
+    ]
+
+    evaluation_result = client.evals.evaluate(dataset=eval_dataset, metrics=metrics)
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) == 1
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),

tests/unit/vertexai/genai/replays/test_evaluate_predefined_metrics.py

Lines changed: 63 additions & 2 deletions
@@ -224,6 +224,7 @@ def test_multi_turn_predefined_metric(client):
 
     predefined_metrics = [
         types.RubricMetric.MULTI_TURN_GENERAL_QUALITY,
+        types.RubricMetric.MULTI_TURN_TEXT_QUALITY,
     ]
 
     evaluation_result = client.evals.evaluate(
@@ -233,11 +234,16 @@ def test_multi_turn_predefined_metric(client):
 
     assert isinstance(evaluation_result, types.EvaluationResult)
     assert evaluation_result.summary_metrics is not None
-    assert len(evaluation_result.summary_metrics) > 0
+    assert len(evaluation_result.summary_metrics) == 2
+    metric_names = set()
     for summary in evaluation_result.summary_metrics:
         assert isinstance(summary, types.AggregatedMetricResult)
-        assert summary.metric_name == "multi_turn_general_quality_v1"
+        metric_names.add(summary.metric_name)
         assert isinstance(summary.mean_score, float)
+    assert metric_names == {
+        "multi_turn_general_quality_v1",
+        "multi_turn_text_quality_v1",
+    }
 
     assert evaluation_result.eval_case_results is not None
     assert len(evaluation_result.eval_case_results) > 0
@@ -415,6 +421,61 @@ def test_evaluation_gecko_text2video_metric(client):
         assert case_result.response_candidate_results is not None
 
 
+def test_single_turn_rubric_metrics(client):
+    """Tests single-turn text quality RubricMetrics with reference."""
+    prompts_df = pd.DataFrame(
+        {
+            "prompt": ["Summarize the benefits of regular exercise."],
+            "response": [
+                "Exercise improves cardiovascular health, boosts mood through"
+                " endorphin release, strengthens muscles and bones, and enhances"
+                " sleep quality. Regular physical activity also helps maintain a"
+                " healthy weight and reduces the risk of chronic diseases."
+            ],
+            "reference": [
+                "Exercise improves heart health, mood, muscle strength, and sleep."
+            ],
+            "context": [
+                "Exercise improves heart health, mood, muscle strength, and sleep."
+            ],
+        }
+    )
+
+    eval_dataset = types.EvaluationDataset(
+        eval_dataset_df=prompts_df,
+        candidate_name="gemini-2.5-flash",
+    )
+
+    predefined_metrics = [
+        types.RubricMetric.INSTRUCTION_FOLLOWING,
+        types.RubricMetric.GENERAL_QUALITY,
+        types.RubricMetric.TEXT_QUALITY,
+        types.RubricMetric.GROUNDING,
+        types.RubricMetric.SAFETY,
+        types.RubricMetric.FINAL_RESPONSE_MATCH,
+        types.RubricMetric.FINAL_RESPONSE_REFERENCE_FREE,
+    ]
+
+    evaluation_result = client.evals.evaluate(
+        dataset=eval_dataset,
+        metrics=predefined_metrics,
+    )
+
+    assert isinstance(evaluation_result, types.EvaluationResult)
+    assert evaluation_result.summary_metrics is not None
+    assert len(evaluation_result.summary_metrics) > 0
+    for summary in evaluation_result.summary_metrics:
+        assert isinstance(summary, types.AggregatedMetricResult)
+        assert summary.metric_name is not None
+
+    assert evaluation_result.eval_case_results is not None
+    assert len(evaluation_result.eval_case_results) > 0
+    for case_result in evaluation_result.eval_case_results:
+        assert isinstance(case_result, types.EvalCaseResult)
+        assert case_result.eval_case_index is not None
+        assert case_result.response_candidate_results is not None
+
+
 pytestmark = pytest_helper.setup(
     file=__file__,
     globals_for_file=globals(),
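
A note on the tightened multi-turn assertions above: collecting metric_name values into a set and comparing against the expected pair replaces the old per-iteration equality check, presumably because summary_metrics is not guaranteed to come back in the same order as the requested metrics. A standalone illustration with stand-in data (plain dicts here are a simplification; the SDK returns types.AggregatedMetricResult objects):

# Stand-in summaries; deliberately out of order relative to the request.
summaries = [
    {"metric_name": "multi_turn_text_quality_v1"},
    {"metric_name": "multi_turn_general_quality_v1"},
]
metric_names = {s["metric_name"] for s in summaries}
# Order-independent comparison, mirroring the assertion added above.
assert metric_names == {
    "multi_turn_general_quality_v1",
    "multi_turn_text_quality_v1",
}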
