 
 from judgeval.v1 import Judgeval
 from judgeval.v1.data import Example
-from judgeval.v1.scorers.prompt_scorer.prompt_scorer import PromptScorer
+from e2etests.conftest import ScorerFactory
 
 
-def run_eval_helper(client: Judgeval, eval_run_name: str):
+def run_eval_helper(client: Judgeval, eval_run_name: str, local_scorer: ScorerFactory):
     example1 = Example.create(
         input="Generate a cold outreach email for TechCorp. Facts: They recently launched an AI-powered analytics platform. Their CEO Sarah Chen previously worked at Google. They have 50+ enterprise clients.",
         actual_output="Dear Ms. Chen,\n\nI noticed TechCorp's recent launch of your AI analytics platform and was impressed by its enterprise-focused approach. Your experience from Google clearly shines through in building scalable solutions, as evidenced by your impressive 50+ enterprise client base.\n\nWould you be open to a brief call to discuss how we could potentially collaborate?\n\nBest regards,\nAlex",
@@ -18,14 +18,8 @@ def run_eval_helper(client: Judgeval, eval_run_name: str):
         retrieval_context="GreenEnergy Solutions won 2023 sustainability award. New solar technology 30% more efficient. Planning European market expansion.",
     )
 
-    scorer = PromptScorer(
-        name="faithfulness",
-        prompt="Is the output faithful to the retrieval context?",
-        threshold=0.5,
-    )
-    scorer2 = PromptScorer(
-        name="relevancy", prompt="Is the output relevant to the input?", threshold=0.5
-    )
+    scorer = local_scorer("Is the output faithful to the retrieval context?")
+    scorer2 = local_scorer("Is the output relevant to the input?")
 
     evaluation = client.evaluation.create()
     res = evaluation.run(
@@ -36,7 +30,7 @@ def run_eval_helper(client: Judgeval, eval_run_name: str):
     return res
 
 
-def test_basic_eval(client: Judgeval, random_name: str):
+def test_basic_eval(client: Judgeval, random_name: str, local_scorer: ScorerFactory):
     evaluation = client.evaluation.create()
     res = evaluation.run(
         examples=[
@@ -45,28 +39,22 @@ def test_basic_eval(client: Judgeval, random_name: str):
                 actual_output="The capital of France is Paris.",
             )
         ],
-        scorers=[
-            PromptScorer(
-                name="relevancy",
-                prompt="Is the output relevant to the input?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[local_scorer("Is the output relevant to the input?")],
         eval_run_name=random_name,
     )
 
     assert res, "No evaluation results found"
 
 
-def test_run_eval(client: Judgeval, random_name: str):
-    res = run_eval_helper(client, random_name)
+def test_run_eval(client: Judgeval, random_name: str, local_scorer: ScorerFactory):
+    res = run_eval_helper(client, random_name, local_scorer)
     assert res, f"No evaluation results found for {random_name}"
 
-    res2 = run_eval_helper(client, random_name)
+    res2 = run_eval_helper(client, random_name, local_scorer)
     assert res2, f"No evaluation results found for {random_name}"
 
 
-def test_assert_test(client: Judgeval):
+def test_assert_test(client: Judgeval, local_scorer: ScorerFactory):
     example = Example.create(
         input="What if these shoes don't fit?",
         actual_output="We offer a 30-day full refund at no extra cost.",
@@ -82,9 +70,7 @@ def test_assert_test(client: Judgeval):
         actual_output="No, the room is too small.",
     )
 
-    scorer = PromptScorer(
-        name="relevancy", prompt="Is the output relevant to the input?", threshold=0.5
-    )
+    scorer = local_scorer("Is the output relevant to the input?")
 
     evaluation = client.evaluation.create()
     with pytest.raises(AssertionError):
@@ -96,7 +82,9 @@ def test_assert_test(client: Judgeval):
         )
 
 
-def test_evaluate_dataset(client: Judgeval, random_name: str):
+def test_evaluate_dataset(
+    client: Judgeval, random_name: str, local_scorer: ScorerFactory
+):
     example1 = Example.create(
         input="What if these shoes don't fit?",
         actual_output="We offer a 30-day full refund at no extra cost.",
@@ -113,19 +101,15 @@ def test_evaluate_dataset(client: Judgeval, random_name: str):
     evaluation = client.evaluation.create()
     res = evaluation.run(
         examples=list(dataset),
-        scorers=[
-            PromptScorer(
-                name="faithfulness",
-                prompt="Is the output faithful to the retrieval context?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[local_scorer("Is the output faithful to the retrieval context?")],
         eval_run_name=random_name,
     )
     assert res, "Dataset evaluation failed"
 
 
-def test_dataset_and_evaluation(client: Judgeval, random_name: str):
+def test_dataset_and_evaluation(
+    client: Judgeval, random_name: str, local_scorer: ScorerFactory
+):
     examples = [
         Example.create(input="input 1", actual_output="output 1"),
         Example.create(input="input 2", actual_output="output 2"),
@@ -138,19 +122,15 @@ def test_dataset_and_evaluation(client: Judgeval, random_name: str):
     evaluation = client.evaluation.create()
     res = evaluation.run(
         examples=examples,
-        scorers=[
-            PromptScorer(
-                name="relevancy",
-                prompt="Is the output relevant to the input?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[local_scorer("Is the output relevant to the input?")],
         eval_run_name=random_name,
     )
     assert res, "Dataset evaluation failed"
 
 
-def test_dataset_and_double_evaluation(client: Judgeval, random_name: str):
+def test_dataset_and_double_evaluation(
+    client: Judgeval, random_name: str, local_scorer: ScorerFactory
+):
     examples = [
         Example.create(input="input 1", actual_output="output 1"),
         Example.create(input="input 2", actual_output="output 2"),
@@ -160,29 +140,19 @@ def test_dataset_and_double_evaluation(client: Judgeval, random_name: str):
     assert dataset, "Failed to pull dataset"
     assert len(dataset) == 2, "Dataset should have 2 examples"
 
+    scorer = local_scorer("Is the output relevant to the input?")
+
     evaluation = client.evaluation.create()
     res = evaluation.run(
         examples=list(dataset),
-        scorers=[
-            PromptScorer(
-                name="relevancy",
-                prompt="Is the output relevant to the input?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[scorer],
         eval_run_name=random_name,
     )
     assert res, "Dataset evaluation failed"
 
     res2 = evaluation.run(
         examples=list(dataset),
-        scorers=[
-            PromptScorer(
-                name="relevancy",
-                prompt="Is the output relevant to the input?",
-                threshold=0.5,
-            )
-        ],
+        scorers=[scorer],
         eval_run_name=random_name,
     )
     assert res2, "Dataset evaluation failed"
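
Note: this diff imports ScorerFactory and takes a local_scorer fixture from e2etests/conftest.py, which is not shown here. The sketch below is a minimal guess at what that fixture might look like, assuming it simply wraps the PromptScorer constructor used by the removed lines; the fixture body, default name, and threshold are assumptions, not part of this change.

# Hypothetical sketch of e2etests/conftest.py -- the real fixture may differ.
from typing import Callable

import pytest

from judgeval.v1.scorers.prompt_scorer.prompt_scorer import PromptScorer

# A ScorerFactory maps a prompt string to a configured scorer.
ScorerFactory = Callable[[str], PromptScorer]


@pytest.fixture
def local_scorer() -> ScorerFactory:
    def _make(prompt: str) -> PromptScorer:
        # The name and threshold are assumed defaults; the tests above only
        # ever pass a single prompt string to the factory.
        return PromptScorer(name=prompt, prompt=prompt, threshold=0.5)

    return _make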