Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions deepeval/metrics/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
RoleAdherenceMetric,
)
from .conversational_g_eval.conversational_g_eval import ConversationalGEval
from .trust_score.trust_score import TrustScoreMetric
from .multimodal_metrics import (
TextToImageMetric,
ImageEditingMetric,
Expand Down Expand Up @@ -129,4 +130,5 @@
"ImageCoherenceMetric",
"ImageHelpfulnessMetric",
"ImageReferenceMetric",
"TrustScoreMetric",
]
3 changes: 3 additions & 0 deletions deepeval/metrics/trust_score/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
"""Public interface of the trust-score metric package.

Re-exports :class:`TrustScoreMetric` so callers can import it directly
from ``deepeval.metrics.trust_score``.
"""
from .trust_score import TrustScoreMetric

__all__ = ["TrustScoreMetric"]
125 changes: 125 additions & 0 deletions deepeval/metrics/trust_score/trust_score.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
from typing import Dict, List
from deepeval.metrics import BaseMetric
from deepeval.test_case import LLMTestCase, LLMTestCaseParams


class TrustScoreMetric(BaseMetric):
    """Scores how trustworthy the retrieval context of a test case is.

    Each retrieval-context chunk is matched (case-insensitive substring)
    against a user-supplied ``source_tiers`` mapping of source name ->
    tier number (1 = most trusted .. 5 = least trusted). Each chunk gets
    a per-tier score (tier 1 -> 1.0 down to tier 5 -> 0.2); chunks that
    match no configured source receive a neutral 0.5. The final score is
    the mean over all chunks; an empty context scores 1.0.
    """

    _required_params: List[LLMTestCaseParams] = [
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.RETRIEVAL_CONTEXT,
    ]

    # Tier number -> per-chunk score. Tiers outside this table (and
    # unmatched chunks) fall back to _UNMATCHED_SCORE.
    _TIER_SCORES: Dict[int, float] = {1: 1.0, 2: 0.8, 3: 0.6, 4: 0.4, 5: 0.2}
    _UNMATCHED_SCORE: float = 0.5

    def __init__(
        self,
        source_tiers: Dict[str, int],
        threshold: float = 0.7,
        verbose_mode: bool = False,
    ):
        """
        Args:
            source_tiers: Mapping of source name (matched as a
                case-insensitive substring of each context chunk) to a
                tier number, conventionally 1-5.
            threshold: Minimum score for the metric to count as successful.
            verbose_mode: Verbosity flag kept for API parity with other
                deepeval metrics.
        """
        self.source_tiers = source_tiers
        self.threshold = threshold
        self.verbose_mode = verbose_mode
        # Initialize result attributes so is_successful() and repr-style
        # access are safe even before measure() has run.
        self.score = None
        self.reason = None
        self.success = None
        self.error = None

    def measure(
        self,
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _log_metric_to_confident: bool = True,
    ) -> float:
        """Compute the trust score for *test_case* and return it.

        Sets ``self.score``, ``self.reason`` and ``self.success`` as side
        effects, and optionally posts the result to Confident.
        """
        # Imported lazily to avoid circular imports at module load time.
        from deepeval.metrics.utils import check_llm_test_case_params
        from deepeval.metrics.indicator import metric_progress_indicator

        check_llm_test_case_params(
            test_case,
            self._required_params,
            None,
            None,
            self,
            None,
            test_case.multimodal,
        )

        with metric_progress_indicator(
            self, _show_indicator=_show_indicator, _in_component=_in_component
        ):
            self.error = None
            if not test_case.retrieval_context:
                # Nothing retrieved means nothing to distrust: treat an
                # empty context as fully trusted.
                self.score = 1.0
                self.reason = "No retrieval context provided."
            else:
                total_score = 0.0
                reasons = []
                for context_chunk in test_case.retrieval_context:
                    chunk_lower = context_chunk.lower()
                    matched_source = None
                    matched_tier = None
                    # First configured source found in the chunk wins.
                    for source, tier in self.source_tiers.items():
                        if source.lower() in chunk_lower:
                            matched_source = source
                            matched_tier = tier
                            break

                    if matched_source is None:
                        chunk_score = self._UNMATCHED_SCORE
                        matched_source = "Unmatched Source"
                        matched_tier = "None"
                    else:
                        # Out-of-range tiers fall back to the neutral
                        # score but keep reporting the real source name.
                        chunk_score = self._TIER_SCORES.get(
                            matched_tier, self._UNMATCHED_SCORE
                        )

                    total_score += chunk_score
                    reasons.append(
                        f"'{matched_source}' mapped to tier {matched_tier}"
                    )

                self.score = total_score / len(test_case.retrieval_context)
                self.reason = "Sources found: " + ", ".join(reasons)

            self.success = self.score >= self.threshold

            # Log on every path (the previous early return for an empty
            # context skipped this step inconsistently).
            if _log_metric_to_confident:
                from deepeval.metrics.api import metric_data_manager

                metric_data_manager.post_metric_if_enabled(
                    self, test_case=test_case
                )

            return self.score

    async def a_measure(
        self,
        test_case: LLMTestCase,
        _show_indicator: bool = True,
        _in_component: bool = False,
        _log_metric_to_confident: bool = True,
    ) -> float:
        """Async wrapper; the computation itself is synchronous."""
        return self.measure(
            test_case,
            _show_indicator=_show_indicator,
            _in_component=_in_component,
            _log_metric_to_confident=_log_metric_to_confident,
        )

    def is_successful(self) -> bool:
        """Return whether the last measurement met the threshold."""
        if self.error is not None:
            self.success = False
        elif self.score is None:
            # measure() has not run yet (or failed before scoring).
            self.success = False
        else:
            self.success = self.score >= self.threshold
        return self.success

    @property
    def __name__(self):
        return "Trust Score"
3 changes: 3 additions & 0 deletions test_import.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Smoke test: confirms TrustScoreMetric is exported from the
# deepeval.metrics package namespace (i.e. the __init__.py wiring works).
from deepeval.metrics import TrustScoreMetric

print("Successfully imported TrustScoreMetric!")
109 changes: 109 additions & 0 deletions tests/test_trust_score_metric.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
from deepeval.metrics import TrustScoreMetric
from deepeval.test_case import LLMTestCase


def test_high_trust():
    # A tier-1 source in the context should produce a perfect score and
    # name the matched source in the reason.
    metric = TrustScoreMetric(
        source_tiers={
            "SEC Filings": 1,
            "Verified Blog": 2,
            "Unverified Post": 4,
        }
    )
    case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[
            "According to SEC filings, Apple's revenue is 394 billion."
        ],
    )
    metric.measure(case)

    assert metric.score == 1.0
    assert metric.success is True
    assert "SEC Filings" in metric.reason
    assert "tier 1" in metric.reason


def test_low_trust():
    # A tier-4 source scores 0.4, which is below the default 0.7 threshold.
    metric = TrustScoreMetric(
        source_tiers={
            "SEC Filings": 1,
            "Verified Blog": 2,
            "Unverified Post": 4,
        }
    )
    case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[
            "I read in an unverified post that Apple's revenue is 394 billion."
        ],
    )
    metric.measure(case)

    assert metric.score == 0.4
    assert metric.success is False
    assert "Unverified Post" in metric.reason
    assert "tier 4" in metric.reason


def test_mixed_sources():
    """Mixed-tier contexts average their per-chunk scores."""
    source_tiers = {"SEC Filings": 1, "Verified Blog": 2, "Unverified Post": 4}
    metric = TrustScoreMetric(source_tiers=source_tiers)
    test_case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[
            "According to SEC filings, Apple's revenue is 394 billion.",
            "I read in an unverified post that Apple's revenue is 394 billion.",
        ],
    )
    metric.measure(test_case)
    # (1.0 + 0.4) / 2 == 0.7, but avoid exact float equality on a
    # computed mean — binary rounding makes it fragile.
    assert abs(metric.score - 0.7) < 1e-9
    assert metric.success is True


def test_unmatched_source():
    # Context matching no configured source falls back to the neutral
    # 0.5 score, which fails the default 0.7 threshold.
    metric = TrustScoreMetric(source_tiers={"SEC Filings": 1})
    case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[
            "A random guy told me Apple's revenue is 394 billion."
        ],
    )
    metric.measure(case)

    assert metric.score == 0.5
    assert metric.success is False


def test_threshold_pass():
    # Tier 2 scores 0.8, clearing a 0.7 threshold.
    metric = TrustScoreMetric(source_tiers={"Verified Blog": 2}, threshold=0.7)
    case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[
            "According to a Verified Blog, Apple's revenue is 394 billion."
        ],
    )
    metric.measure(case)

    assert metric.success is True


def test_threshold_fail():
    # Tier 2 scores 0.8, which does not reach a 0.9 threshold.
    metric = TrustScoreMetric(source_tiers={"Verified Blog": 2}, threshold=0.9)
    case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[
            "According to a Verified Blog, Apple's revenue is 394 billion."
        ],
    )
    metric.measure(case)

    assert metric.success is False


def test_empty_retrieval_context():
    # An empty retrieval context is treated as fully trusted: score 1.0
    # and an explanatory reason.
    metric = TrustScoreMetric(source_tiers={"SEC Filings": 1})
    case = LLMTestCase(
        input="What is Apple's revenue?",
        actual_output="Apple's revenue is 394 billion.",
        retrieval_context=[],
    )
    metric.measure(case)

    assert metric.score == 1.0
    assert metric.success is True
    assert "No retrieval context provided" in metric.reason
Loading