 
 from judgeval.v1 import Judgeval
 from judgeval.v1.data import Example
-from judgeval.v1.scorers.prompt_scorer.prompt_scorer import PromptScorer
+from e2etests.conftest import ScorerFactory
 
 
-def run_eval_helper(client: Judgeval, eval_run_name: str):
+def run_eval_helper(client: Judgeval, eval_run_name: str, local_scorer: ScorerFactory):
     example1 = Example.create(
         input="Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.",
         actual_output="Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex",
@@ -18,14 +18,8 @@ def run_eval_helper(client: Judgeval, eval_run_name: str):
         retrieval_context="GreenEnergy Solutions won 2023 sustainability award. New solar technology 30% more efficient. Planning European market expansion.",
     )
 
-    scorer = PromptScorer(
-        name="faithfulness",
-        prompt="Is the output faithful to the retrieval context?",
-        threshold=0.5,
-    )
-    scorer2 = PromptScorer(
-        name="relevancy", prompt="Is the output relevant to the input?", threshold=0.5
-    )
+    scorer = local_scorer("Is the output faithful to the retrieval context?")
+    scorer2 = local_scorer("Is the output relevant to the input?")
 
     evaluation = client.evaluation.create()
     res = evaluation.run(
@@ -36,7 +30,7 @@ def run_eval_helper(client: Judgeval, eval_run_name: str):
     return res
 
 
-def test_basic_eval(client: Judgeval, random_name: str):
+def test_basic_eval(client: Judgeval, random_name: str, local_scorer: ScorerFactory):
     evaluation = client.evaluation.create()
     res = evaluation.run(
         examples=[
@@ -45,28 +39,22 @@ def test_basic_eval(client: Judgeval, random_name: str):
                 actual_output="The capital of France is Paris.",
             )
         ],
-        scorers=[
-            PromptScorer(
-                name="relevancy",
-                prompt="Is the output relevant to the input?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[local_scorer("Is the output relevant to the input?")],
         eval_run_name=random_name,
     )
 
     assert res, "No evaluation results found"
 
 
-def test_run_eval(client: Judgeval, random_name: str):
-    res = run_eval_helper(client, random_name)
+def test_run_eval(client: Judgeval, random_name: str, local_scorer: ScorerFactory):
+    res = run_eval_helper(client, random_name, local_scorer)
     assert res, f"No evaluation results found for {random_name}"
 
-    res2 = run_eval_helper(client, random_name)
+    res2 = run_eval_helper(client, random_name, local_scorer)
     assert res2, f"No evaluation results found for {random_name}"
 
 
-def test_assert_test(client: Judgeval):
+def test_assert_test(client: Judgeval, local_scorer: ScorerFactory):
     example = Example.create(
         input="What if these shoes don't fit?",
         actual_output="We offer a 30-day full refund at no extra cost.",
@@ -82,9 +70,7 @@ def test_assert_test(client: Judgeval):
         actual_output="No, the room is too small.",
     )
 
-    scorer = PromptScorer(
-        name="relevancy", prompt="Is the output relevant to the input?", threshold=0.5
-    )
+    scorer = local_scorer("Is the output relevant to the input?")
 
     evaluation = client.evaluation.create()
     with pytest.raises(AssertionError):
@@ -96,7 +82,9 @@ def test_assert_test(client: Judgeval):
         )
 
 
-def test_evaluate_dataset(client: Judgeval, random_name: str):
+def test_evaluate_dataset(
+    client: Judgeval, random_name: str, local_scorer: ScorerFactory
+):
     example1 = Example.create(
         input="What if these shoes don't fit?",
         actual_output="We offer a 30-day full refund at no extra cost.",
@@ -113,19 +101,15 @@ def test_evaluate_dataset(client: Judgeval, random_name: str):
     evaluation = client.evaluation.create()
     res = evaluation.run(
         examples=list(dataset),
-        scorers=[
-            PromptScorer(
-                name="faithfulness",
-                prompt="Is the output faithful to the retrieval context?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[local_scorer("Is the output faithful to the retrieval context?")],
         eval_run_name=random_name,
     )
     assert res, "Dataset evaluation failed"
 
 
-def test_dataset_and_evaluation(client: Judgeval, random_name: str):
+def test_dataset_and_evaluation(
+    client: Judgeval, random_name: str, local_scorer: ScorerFactory
+):
     examples = [
         Example.create(input="input 1", actual_output="output 1"),
         Example.create(input="input 2", actual_output="output 2"),
@@ -138,19 +122,15 @@ def test_dataset_and_evaluation(client: Judgeval, random_name: str):
     evaluation = client.evaluation.create()
     res = evaluation.run(
         examples=examples,
-        scorers=[
-            PromptScorer(
-                name="relevancy",
-                prompt="Is the output relevant to the input?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[local_scorer("Is the output relevant to the input?")],
         eval_run_name=random_name,
     )
     assert res, "Dataset evaluation failed"
 
 
-def test_dataset_and_double_evaluation(client: Judgeval, random_name: str):
+def test_dataset_and_double_evaluation(
+    client: Judgeval, random_name: str, local_scorer: ScorerFactory
+):
     examples = [
         Example.create(input="input 1", actual_output="output 1"),
         Example.create(input="input 2", actual_output="output 2"),
@@ -160,29 +140,19 @@ def test_dataset_and_double_evaluation(client: Judgeval, random_name: str):
     assert dataset, "Failed to pull dataset"
     assert len(dataset) == 2, "Dataset should have 2 examples"
 
+    scorer = local_scorer("Is the output relevant to the input?")
+
     evaluation = client.evaluation.create()
     res = evaluation.run(
         examples=list(dataset),
-        scorers=[
-            PromptScorer(
-                name="relevancy",
-                prompt="Is the output relevant to the input?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[scorer],
         eval_run_name=random_name,
     )
     assert res, "Dataset evaluation failed"
 
     res2 = evaluation.run(
         examples=list(dataset),
-        scorers=[
-            PromptScorer(
-                name="relevancy",
-                prompt="Is the output relevant to the input?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[scorer],
         eval_run_name=random_name,
     )
     assert res2, "Dataset evaluation failed"
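
Note: this diff imports ScorerFactory and takes a local_scorer fixture from e2etests/conftest.py, which is not shown here. The sketch below is a minimal guess at what that fixture might look like, assuming it simply wraps the PromptScorer constructor used by the removed lines; the fixture body, default name, and threshold are assumptions, not part of this change.

# Hypothetical sketch of e2etests/conftest.py -- the real fixture may differ.
from typing import Callable

import pytest

from judgeval.v1.scorers.prompt_scorer.prompt_scorer import PromptScorer

# A ScorerFactory maps a prompt string to a configured scorer.
ScorerFactory = Callable[[str], PromptScorer]


@pytest.fixture
def local_scorer() -> ScorerFactory:
    def _make(prompt: str) -> PromptScorer:
        # The name and threshold are assumed defaults; the tests above only
        # ever pass a single prompt string to the factory.
        return PromptScorer(name=prompt, prompt=prompt, threshold=0.5)

    return _make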