Commit 895b5b8

Release: Merge staging to main

2 parents eee829c + 5bed54a
8 files changed: 210 additions & 56 deletions

New file: GitHub Actions release workflow

Lines changed: 99 additions & 0 deletions
@@ -0,0 +1,99 @@
+on:
+  workflow_dispatch:
+    inputs:
+      major_version:
+        description: 'Major version to release'
+        required: true
+        default: 0
+        type: number
+      minor_version:
+        description: 'Minor version to release'
+        required: true
+        type: number
+      patch_version:
+        description: 'Patch version to release'
+        required: true
+        type: number
+
+jobs:
+  validate:
+    runs-on: ubuntu-latest
+    env:
+      RELEASE_VERSION: ${{ github.event.inputs.major_version }}.${{ github.event.inputs.minor_version }}.${{ github.event.inputs.patch_version }}
+    steps:
+      - name: Validate version is available on PyPI
+        run: |
+          status_code=$(curl -s -o /dev/null -w "%{http_code}" "https://pypi.org/pypi/judgeval/$RELEASE_VERSION/json")
+          if [ "$status_code" = "200" ]; then
+            echo "Error: Version $RELEASE_VERSION already exists on PyPI"
+            exit 1
+          fi
+          echo "Version $RELEASE_VERSION is available"
+
+  release:
+    needs: validate
+    runs-on: ubuntu-latest
+    env:
+      RELEASE_VERSION: ${{ github.event.inputs.major_version }}.${{ github.event.inputs.minor_version }}.${{ github.event.inputs.patch_version }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Install Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: 3.11
+
+      - name: Create new tag
+        run: |
+          git config user.name "github-actions"
+          git config user.email "github-actions@github.com"
+          git tag "v$RELEASE_VERSION"
+          git push origin "v$RELEASE_VERSION"
+
+      - name: Create GitHub release
+        uses: softprops/action-gh-release@v2
+        with:
+          tag_name: v${{ env.RELEASE_VERSION }}
+          generate_release_notes: true
+          body: |
+            You can find this package release on PyPI: https://pypi.org/project/judgeval/${{ env.RELEASE_VERSION }}/
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+
+      - name: Bump pyproject.toml and version.py version
+        run: python update_version.py "$RELEASE_VERSION"
+
+      - name: Build PyPI package
+        run: |
+          python -m pip install --upgrade build
+          python -m build
+
+      - name: Create PyPI release
+        run: |
+          python -m pip install --upgrade twine
+          python -m twine upload --repository pypi -u ${{ secrets.PYPI_USERNAME }} -p ${{ secrets.PYPI_PASSWORD }} dist/*
+
+  cleanup:
+    needs: release
+    if: failure()
+    runs-on: ubuntu-latest
+    env:
+      RELEASE_VERSION: ${{ github.event.inputs.major_version }}.${{ github.event.inputs.minor_version }}.${{ github.event.inputs.patch_version }}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
+
+      - name: Authenticate GitHub CLI
+        run: echo "${{ secrets.GITHUB_TOKEN }}" | gh auth login --with-token
+
+      - name: Delete tag and release
+        run: |
+          gh release delete "v$RELEASE_VERSION" --yes || true
+          git push --delete origin "v$RELEASE_VERSION" || true
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
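The validate job gates the release on a PyPI availability probe: pypi.org's JSON endpoint returns 200 only for versions that are already published. A minimal Python sketch of the same check, standard library only (the workflow itself uses curl, and the helper name here is illustrative):

# Sketch of the workflow's availability check: 200 means the version is
# already on PyPI, 404 means it is free to use.
import urllib.error
import urllib.request

def version_is_available(version: str) -> bool:
    url = f"https://pypi.org/pypi/judgeval/{version}/json"
    try:
        with urllib.request.urlopen(url) as resp:
            return resp.status != 200  # 200 -> already published
    except urllib.error.HTTPError as err:
        return err.code == 404  # 404 -> not published yet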

src/e2etests/conftest.py

Lines changed: 35 additions & 0 deletions
@@ -2,10 +2,15 @@
 import pytest
 import random
 import string
+from typing import Callable
 from dotenv import load_dotenv
 
 from judgeval.v1 import Judgeval
+from judgeval.v1.judges import Judge, BinaryResponse
 from e2etests.utils import delete_project, create_project
+from judgeval.v1.data.example import Example
+
+ScorerFactory = Callable[[str], Judge[BinaryResponse]]
 
 load_dotenv()
 
@@ -38,3 +43,33 @@ def client(project_name: str):
 @pytest.fixture
 def random_name() -> str:
     return "".join(random.choices(string.ascii_letters + string.digits, k=12))
+
+
+@pytest.fixture
+def local_scorer() -> ScorerFactory:
+    def _make(prompt: str) -> Judge[BinaryResponse]:
+        class LLMScorer(Judge[BinaryResponse]):
+            async def score(self, data: Example) -> BinaryResponse:
+                from openai import AsyncOpenAI
+
+                client = AsyncOpenAI()
+                response = await client.chat.completions.parse(
+                    model="gpt-4o-mini",
+                    messages=[
+                        {"role": "system", "content": prompt},
+                        {
+                            "role": "user",
+                            "content": (
+                                f"Input: {data['input']}\n"
+                                f"Output: {data['actual_output']}"
+                            ),
+                        },
+                    ],
+                    response_format=BinaryResponse,
+                )
+                result = response.choices[0].message.parsed
+                return result if result else BinaryResponse(value=False, reason="Error")
+
+        return LLMScorer()
+
+    return _make
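A minimal usage sketch for the fixture above, assuming OPENAI_API_KEY is set in the environment. The test name and example content are illustrative; score is a coroutine, so outside the evaluation runner it can be driven with asyncio.run:

import asyncio

from e2etests.conftest import ScorerFactory
from judgeval.v1.data.example import Example

def test_scorer_directly(local_scorer: ScorerFactory):
    # local_scorer is injected by pytest from the fixture above.
    scorer = local_scorer("Is the output relevant to the input?")
    example = Example.create(input="What is 2 + 2?", actual_output="4")
    result = asyncio.run(scorer.score(example))
    assert result.value, result.reason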

src/e2etests/test_eval_operations.py

Lines changed: 26 additions & 56 deletions
@@ -2,10 +2,10 @@
 
 from judgeval.v1 import Judgeval
 from judgeval.v1.data import Example
-from judgeval.v1.scorers.prompt_scorer.prompt_scorer import PromptScorer
+from e2etests.conftest import ScorerFactory
 
 
-def run_eval_helper(client: Judgeval, eval_run_name: str):
+def run_eval_helper(client: Judgeval, eval_run_name: str, local_scorer: ScorerFactory):
     example1 = Example.create(
         input="Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.",
         actual_output="Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex",
@@ -18,14 +18,8 @@ def run_eval_helper(client: Judgeval, eval_run_name: str):
         retrieval_context="GreenEnergy Solutions won 2023 sustainability award. New solar technology 30% more efficient. Planning European market expansion.",
     )
 
-    scorer = PromptScorer(
-        name="faithfulness",
-        prompt="Is the output faithful to the retrieval context?",
-        threshold=0.5,
-    )
-    scorer2 = PromptScorer(
-        name="relevancy", prompt="Is the output relevant to the input?", threshold=0.5
-    )
+    scorer = local_scorer("Is the output faithful to the retrieval context?")
+    scorer2 = local_scorer("Is the output relevant to the input?")
 
     evaluation = client.evaluation.create()
     res = evaluation.run(
@@ -36,7 +30,7 @@ def run_eval_helper(client: Judgeval, eval_run_name: str):
     return res
 
 
-def test_basic_eval(client: Judgeval, random_name: str):
+def test_basic_eval(client: Judgeval, random_name: str, local_scorer: ScorerFactory):
     evaluation = client.evaluation.create()
     res = evaluation.run(
         examples=[
@@ -45,28 +39,22 @@ def test_basic_eval(client: Judgeval, random_name: str):
                 actual_output="The capital of France is Paris.",
             )
         ],
-        scorers=[
-            PromptScorer(
-                name="relevancy",
-                prompt="Is the output relevant to the input?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[local_scorer("Is the output relevant to the input?")],
        eval_run_name=random_name,
     )
 
     assert res, "No evaluation results found"
 
 
-def test_run_eval(client: Judgeval, random_name: str):
-    res = run_eval_helper(client, random_name)
+def test_run_eval(client: Judgeval, random_name: str, local_scorer: ScorerFactory):
+    res = run_eval_helper(client, random_name, local_scorer)
     assert res, f"No evaluation results found for {random_name}"
 
-    res2 = run_eval_helper(client, random_name)
+    res2 = run_eval_helper(client, random_name, local_scorer)
     assert res2, f"No evaluation results found for {random_name}"
 
 
-def test_assert_test(client: Judgeval):
+def test_assert_test(client: Judgeval, local_scorer: ScorerFactory):
     example = Example.create(
         input="What if these shoes don't fit?",
         actual_output="We offer a 30-day full refund at no extra cost.",
@@ -82,9 +70,7 @@ def test_assert_test(client: Judgeval):
         actual_output="No, the room is too small.",
     )
 
-    scorer = PromptScorer(
-        name="relevancy", prompt="Is the output relevant to the input?", threshold=0.5
-    )
+    scorer = local_scorer("Is the output relevant to the input?")
 
     evaluation = client.evaluation.create()
     with pytest.raises(AssertionError):
@@ -96,7 +82,9 @@ def test_assert_test(client: Judgeval):
     )
 
 
-def test_evaluate_dataset(client: Judgeval, random_name: str):
+def test_evaluate_dataset(
+    client: Judgeval, random_name: str, local_scorer: ScorerFactory
+):
     example1 = Example.create(
         input="What if these shoes don't fit?",
         actual_output="We offer a 30-day full refund at no extra cost.",
@@ -113,19 +101,15 @@ def test_evaluate_dataset(client: Judgeval, random_name: str):
     evaluation = client.evaluation.create()
     res = evaluation.run(
         examples=list(dataset),
-        scorers=[
-            PromptScorer(
-                name="faithfulness",
-                prompt="Is the output faithful to the retrieval context?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[local_scorer("Is the output faithful to the retrieval context?")],
         eval_run_name=random_name,
     )
     assert res, "Dataset evaluation failed"
 
 
-def test_dataset_and_evaluation(client: Judgeval, random_name: str):
+def test_dataset_and_evaluation(
+    client: Judgeval, random_name: str, local_scorer: ScorerFactory
+):
     examples = [
         Example.create(input="input 1", actual_output="output 1"),
         Example.create(input="input 2", actual_output="output 2"),
@@ -138,19 +122,15 @@ def test_dataset_and_evaluation(client: Judgeval, random_name: str):
     evaluation = client.evaluation.create()
     res = evaluation.run(
         examples=examples,
-        scorers=[
-            PromptScorer(
-                name="relevancy",
-                prompt="Is the output relevant to the input?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[local_scorer("Is the output relevant to the input?")],
         eval_run_name=random_name,
     )
     assert res, "Dataset evaluation failed"
 
 
-def test_dataset_and_double_evaluation(client: Judgeval, random_name: str):
+def test_dataset_and_double_evaluation(
+    client: Judgeval, random_name: str, local_scorer: ScorerFactory
+):
     examples = [
         Example.create(input="input 1", actual_output="output 1"),
         Example.create(input="input 2", actual_output="output 2"),
@@ -160,29 +140,19 @@ def test_dataset_and_double_evaluation(client: Judgeval, random_name: str):
     assert dataset, "Failed to pull dataset"
     assert len(dataset) == 2, "Dataset should have 2 examples"
 
+    scorer = local_scorer("Is the output relevant to the input?")
+
     evaluation = client.evaluation.create()
     res = evaluation.run(
         examples=list(dataset),
-        scorers=[
-            PromptScorer(
-                name="relevancy",
-                prompt="Is the output relevant to the input?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[scorer],
         eval_run_name=random_name,
     )
     assert res, "Dataset evaluation failed"
 
     res2 = evaluation.run(
         examples=list(dataset),
-        scorers=[
-            PromptScorer(
-                name="relevancy",
-                prompt="Is the output relevant to the input?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[scorer],
         eval_run_name=random_name,
     )
     assert res2, "Dataset evaluation failed"

src/judgeval/judgment_attribute_keys.py

Lines changed: 1 addition & 0 deletions
@@ -10,6 +10,7 @@ class AttributeKeys(str, Enum):
     JUDGMENT_OFFLINE_MODE = "judgment.offline_mode"
     JUDGMENT_UPDATE_ID = "judgment.update_id"
     JUDGMENT_CUSTOMER_ID = "judgment.customer_id"
+    JUDGMENT_CUSTOMER_USER_ID = "judgment.customer_user_id"
     JUDGMENT_AGENT_ID = "judgment.agent_id"
     JUDGMENT_PARENT_AGENT_ID = "judgment.parent_agent_id"
     JUDGMENT_AGENT_CLASS_NAME = "judgment.agent_class_name"
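Because AttributeKeys subclasses str, the new key can also be set on any OpenTelemetry span directly. A small sketch (the tracer's set_customer_user_id, added below, does this and additionally propagates the value through the context):

# Sketch: tagging the current OpenTelemetry span by hand; the str-enum
# member serializes as "judgment.customer_user_id". The user id is illustrative.
from opentelemetry import trace

from judgeval.judgment_attribute_keys import AttributeKeys

span = trace.get_current_span()
span.set_attribute(AttributeKeys.JUDGMENT_CUSTOMER_USER_ID, "user-1234")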

src/judgeval/v1/tracer/base_tracer.py

Lines changed: 11 additions & 0 deletions
@@ -57,6 +57,7 @@
     AGENT_ID_KEY,
     PARENT_AGENT_ID_KEY,
     CUSTOMER_ID_KEY,
+    CUSTOMER_USER_ID_KEY,
     SESSION_ID_KEY,
     AGENT_CLASS_NAME_KEY,
     AGENT_INSTANCE_NAME_KEY,
@@ -259,6 +260,16 @@ def set_customer_id(self, customer_id: str) -> None:
         ctx = set_value(CUSTOMER_ID_KEY, customer_id, self.get_context())
         self._attach_context(ctx)
 
+    def set_customer_user_id(self, customer_user_id: str) -> None:
+        current_span = self.get_current_span()
+        if current_span is None:
+            return
+        current_span.set_attribute(
+            AttributeKeys.JUDGMENT_CUSTOMER_USER_ID, customer_user_id
+        )
+        ctx = set_value(CUSTOMER_USER_ID_KEY, customer_user_id, self.get_context())
+        self._attach_context(ctx)
+
     def set_session_id(self, session_id: str) -> None:
         current_span = self._get_sampled_span()
         if current_span is None:
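A hedged usage sketch for the new setter. How a tracer instance is obtained is assumed here (the diff only shows the BaseTracer method), and the ids are illustrative:

def tag_current_trace(tracer) -> None:
    # `tracer` is assumed to expose the BaseTracer API shown above.
    # set_customer_user_id no-ops when no span is active.
    tracer.set_customer_id("acme-corp")        # existing setter
    tracer.set_customer_user_id("user-1234")   # added by this commit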

src/judgeval/v1/tracer/processors/_lifecycles/__init__.py

Lines changed: 6 additions & 0 deletions
@@ -3,6 +3,9 @@
 from judgeval.v1.tracer.processors._lifecycles.customer_id_processor import (
     CustomerIdProcessor,
 )
+from judgeval.v1.tracer.processors._lifecycles.customer_user_id_processor import (
+    CustomerUserIdProcessor,
+)
 from judgeval.v1.tracer.processors._lifecycles.session_id_processor import (
     SessionIdProcessor,
 )
@@ -15,6 +18,7 @@
 from judgeval.v1.tracer.processors._lifecycles.registry import get_all, register
 from judgeval.v1.tracer.processors._lifecycles.context_keys import (
     CUSTOMER_ID_KEY,
+    CUSTOMER_USER_ID_KEY,
     SESSION_ID_KEY,
     AGENT_ID_KEY,
     PARENT_AGENT_ID_KEY,
@@ -25,12 +29,14 @@
 
 __all__ = [
     "CustomerIdProcessor",
+    "CustomerUserIdProcessor",
     "SessionIdProcessor",
     "AgentIdProcessor",
     "ProjectIdOverrideProcessor",
     "get_all",
     "register",
     "CUSTOMER_ID_KEY",
+    "CUSTOMER_USER_ID_KEY",
     "SESSION_ID_KEY",
     "AGENT_ID_KEY",
     "PARENT_AGENT_ID_KEY",
