Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions deepeval/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
RoleAdherenceMetric,
)
from .conversational_g_eval.conversational_g_eval import ConversationalGEval
from .trust_score.trust_score import TrustScoreMetric
from .multimodal_metrics import (
TextToImageMetric,
ImageEditingMetric,
Expand Down Expand Up @@ -129,4 +130,5 @@
"ImageCoherenceMetric",
"ImageHelpfulnessMetric",
"ImageReferenceMetric",
"TrustScoreMetric",
]
3 changes: 3 additions & 0 deletions deepeval/metrics/trust_score/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Public interface of the trust-score metric package.

Re-exports :class:`TrustScoreMetric` so callers can import it directly
from ``deepeval.metrics.trust_score``.
"""
from .trust_score import TrustScoreMetric

__all__ = ["TrustScoreMetric"]
125 changes: 125 additions & 0 deletions deepeval/metrics/trust_score/trust_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
from typing import Dict, List
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams


class TrustScoreMetric(BaseMetric):
    """Scores how trustworthy the retrieval context of a test case is.

    Each retrieval-context chunk is matched (case-insensitive substring)
    against a user-supplied ``source_tiers`` mapping of source name ->
    tier number (1 = most trusted .. 5 = least trusted). Each chunk gets
    a per-tier score (tier 1 -> 1.0 down to tier 5 -> 0.2); chunks that
    match no configured source receive a neutral 0.5. The final score is
    the mean over all chunks; an empty context scores 1.0.
    """

    _required_params: List[LLMTestCaseParams] = [
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.RETRIEVAL_CONTEXT,
    ]

    # Tier number -> per-chunk score. Tiers outside this table (and
    # unmatched chunks) fall back to _UNMATCHED_SCORE.
    _TIER_SCORES: Dict[int, float] = {1: 1.0, 2: 0.8, 3: 0.6, 4: 0.4, 5: 0.2}
    _UNMATCHED_SCORE: float = 0.5

    def __init__(
        self,
        source_tiers: Dict[str, int],
        threshold: float = 0.7,
        verbose_mode: bool = False,
    ):
        """
        Args:
            source_tiers: Mapping of source name (matched as a
                case-insensitive substring of each context chunk) to a
                tier number, conventionally 1-5.
            threshold: Minimum score for the metric to count as successful.
            verbose_mode: Verbosity flag kept for API parity with other
                deepeval metrics.
        """
        self.source_tiers = source_tiers
        self.threshold = threshold
        self.verbose_mode = verbose_mode
        # Initialize result attributes so is_successful() and repr-style
        # access are safe even before measure() has run.
        self.score = None
        self.reason = None
        self.success = None
        self.error = None

    def measure(
        self,
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _log_metric_to_confident: bool = True,
    ) -> float:
        """Compute the trust score for *test_case* and return it.

        Sets ``self.score``, ``self.reason`` and ``self.success`` as side
        effects, and optionally posts the result to Confident.
        """
        # Imported lazily to avoid circular imports at module load time.
        from deepeval.metrics.utils import check_llm_test_case_params
        from deepeval.metrics.indicator import metric_progress_indicator

        check_llm_test_case_params(
            test_case,
            self._required_params,
            None,
            None,
            self,
            None,
            test_case.multimodal,
        )

        with metric_progress_indicator(
            self, _show_indicator=_show_indicator, _in_component=_in_component
        ):
            self.error = None
            if not test_case.retrieval_context:
                # Nothing retrieved means nothing to distrust: treat an
                # empty context as fully trusted.
                self.score = 1.0
                self.reason = "No retrieval context provided."
            else:
                total_score = 0.0
                reasons = []
                for context_chunk in test_case.retrieval_context:
                    chunk_lower = context_chunk.lower()
                    matched_source = None
                    matched_tier = None
                    # First configured source found in the chunk wins.
                    for source, tier in self.source_tiers.items():
                        if source.lower() in chunk_lower:
                            matched_source = source
                            matched_tier = tier
                            break

                    if matched_source is None:
                        chunk_score = self._UNMATCHED_SCORE
                        matched_source = "Unmatched Source"
                        matched_tier = "None"
                    else:
                        # Out-of-range tiers fall back to the neutral
                        # score but keep reporting the real source name.
                        chunk_score = self._TIER_SCORES.get(
                            matched_tier, self._UNMATCHED_SCORE
                        )

                    total_score += chunk_score
                    reasons.append(
                        f"'{matched_source}' mapped to tier {matched_tier}"
                    )

                self.score = total_score / len(test_case.retrieval_context)
                self.reason = "Sources found: " + ", ".join(reasons)

            self.success = self.score >= self.threshold

            # Log on every path (the previous early return for an empty
            # context skipped this step inconsistently).
            if _log_metric_to_confident:
                from deepeval.metrics.api import metric_data_manager

                metric_data_manager.post_metric_if_enabled(
                    self, test_case=test_case
                )

            return self.score

    async def a_measure(
        self,
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _log_metric_to_confident: bool = True,
    ) -> float:
        """Async wrapper; the computation itself is synchronous."""
        return self.measure(
            test_case,
            _show_indicator=_show_indicator,
            _in_component=_in_component,
            _log_metric_to_confident=_log_metric_to_confident,
        )

    def is_successful(self) -> bool:
        """Return whether the last measurement met the threshold."""
        if self.error is not None:
            self.success = False
        elif self.score is None:
            # measure() has not run yet (or failed before scoring).
            self.success = False
        else:
            self.success = self.score >= self.threshold
        return self.success

    @property
    def __name__(self):
        return "Trust Score"
3 changes: 3 additions & 0 deletions test_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Smoke test: confirms TrustScoreMetric is exported from the
# deepeval.metrics package namespace (i.e. the __init__.py wiring works).
from deepeval.metrics import TrustScoreMetric

print("Successfully imported TrustScoreMetric!")
109 changes: 109 additions & 0 deletions tests/test_trust_score_metric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
from deepeval.metrics import TrustScoreMetric
from deepeval.test_case import LLMTestCase


def test_high_trust():
    # A tier-1 source in the context should produce a perfect score and
    # name the matched source in the reason.
    metric = TrustScoreMetric(
        source_tiers={
            "SEC Filings": 1,
            "Verified Blog": 2,
            "Unverified Post": 4,
        }
    )
    case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[
            "According to SEC filings, Apple's revenue is 394 billion."
        ],
    )
    metric.measure(case)

    assert metric.score == 1.0
    assert metric.success is True
    assert "SEC Filings" in metric.reason
    assert "tier 1" in metric.reason


def test_low_trust():
    # A tier-4 source scores 0.4, which is below the default 0.7 threshold.
    metric = TrustScoreMetric(
        source_tiers={
            "SEC Filings": 1,
            "Verified Blog": 2,
            "Unverified Post": 4,
        }
    )
    case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[
            "I read in an unverified post that Apple's revenue is 394 billion."
        ],
    )
    metric.measure(case)

    assert metric.score == 0.4
    assert metric.success is False
    assert "Unverified Post" in metric.reason
    assert "tier 4" in metric.reason


def test_mixed_sources():
    """Mixed-tier contexts average their per-chunk scores."""
    source_tiers = {"SEC Filings": 1, "Verified Blog": 2, "Unverified Post": 4}
    metric = TrustScoreMetric(source_tiers=source_tiers)
    test_case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[
            "According to SEC filings, Apple's revenue is 394 billion.",
            "I read in an unverified post that Apple's revenue is 394 billion.",
        ],
    )
    metric.measure(test_case)
    # (1.0 + 0.4) / 2 == 0.7, but avoid exact float equality on a
    # computed mean — binary rounding makes it fragile.
    assert abs(metric.score - 0.7) < 1e-9
    assert metric.success is True


def test_unmatched_source():
    # Context matching no configured source falls back to the neutral
    # 0.5 score, which fails the default 0.7 threshold.
    metric = TrustScoreMetric(source_tiers={"SEC Filings": 1})
    case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[
            "A random guy told me Apple's revenue is 394 billion."
        ],
    )
    metric.measure(case)

    assert metric.score == 0.5
    assert metric.success is False


def test_threshold_pass():
    # Tier 2 scores 0.8, clearing a 0.7 threshold.
    metric = TrustScoreMetric(source_tiers={"Verified Blog": 2}, threshold=0.7)
    case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[
            "According to a Verified Blog, Apple's revenue is 394 billion."
        ],
    )
    metric.measure(case)

    assert metric.success is True


def test_threshold_fail():
    # Tier 2 scores 0.8, which does not reach a 0.9 threshold.
    metric = TrustScoreMetric(source_tiers={"Verified Blog": 2}, threshold=0.9)
    case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[
            "According to a Verified Blog, Apple's revenue is 394 billion."
        ],
    )
    metric.measure(case)

    assert metric.success is False


def test_empty_retrieval_context():
    # An empty retrieval context is treated as fully trusted: score 1.0
    # and an explanatory reason.
    metric = TrustScoreMetric(source_tiers={"SEC Filings": 1})
    case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[],
    )
    metric.measure(case)

    assert metric.score == 1.0
    assert metric.success is True
    assert "No retrieval context provided" in metric.reason
Loading