Skip to content

Commit 65547c6

Browse files
fix(eval): clean up eval configs and add test-clado-api script (#540)
Consolidate 13 configs down to 7 with uniform settings:
- 3 weekly (CI): browseros-agent, browseros-oe-agent, browseros-oe-clado
- 4 test (local): test_gemini-computer-use, test_yutori-navigator, test_webvoyager, test_mind2web
- All configs: headless=false, captcha block, full browseros ports, restart_server_per_task

Deleted: debug-test, mind2web-test, tool-loop-test, orchestrator-executor-test, orchestrator-executor-clado-test, fireworks-minimax-m2, webvoyager-test

Added: test-clado-api.ts script, browseros-oe-agent-weekly.json (OE with AI SDK executor)
1 parent 0babc05 commit 65547c6

13 files changed

+273
-164
lines changed

packages/browseros-agent/apps/eval/configs/orchestrator-executor-test.json renamed to packages/browseros-agent/apps/eval/configs/browseros-oe-agent-weekly.json

Lines changed: 10 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -2,24 +2,21 @@
22
"agent": {
33
"type": "orchestrator-executor",
44
"orchestrator": {
5-
"type": "single",
65
"provider": "openai-compatible",
76
"model": "accounts/fireworks/models/kimi-k2p5",
87
"apiKey": "FIREWORKS_API_KEY",
9-
"baseUrl": "https://api.fireworks.ai/inference/v1",
10-
"supportsImages": true
8+
"baseUrl": "https://api.fireworks.ai/inference/v1"
119
},
1210
"executor": {
1311
"provider": "openai-compatible",
1412
"model": "accounts/fireworks/models/kimi-k2p5",
1513
"apiKey": "FIREWORKS_API_KEY",
16-
"baseUrl": "https://api.fireworks.ai/inference/v1",
17-
"supportsImages": true
14+
"baseUrl": "https://api.fireworks.ai/inference/v1"
1815
}
1916
},
20-
"dataset": "../data/webvoyager_e2e_test.jsonl",
21-
"output_dir": "../results/orchestrator-executor-webvoyager-test",
22-
"num_workers": 3,
17+
"dataset": "../data/webbench-2of4-50.jsonl",
18+
"num_workers": 10,
19+
"restart_server_per_task": true,
2320
"browseros": {
2421
"server_url": "http://127.0.0.1:9110",
2522
"base_cdp_port": 9010,
@@ -28,8 +25,12 @@
2825
"load_extensions": false,
2926
"headless": false
3027
},
28+
"captcha": {
29+
"api_key_env": "NOPECHA_API_KEY"
30+
},
31+
"graders": ["performance_grader"],
3132
"grader_api_key_env": "OPENROUTER_API_KEY",
3233
"grader_base_url": "https://openrouter.ai/api/v1",
3334
"grader_model": "openai/gpt-4.1",
34-
"timeout_ms": 1200000
35+
"timeout_ms": 1800000
3536
}

packages/browseros-agent/apps/eval/configs/browseros-oe-clado-weekly.json

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,10 @@
2323
"base_server_port": 9110,
2424
"base_extension_port": 9310,
2525
"load_extensions": false,
26-
"headless": true
26+
"headless": false
27+
},
28+
"captcha": {
29+
"api_key_env": "NOPECHA_API_KEY"
2730
},
2831
"graders": ["performance_grader"],
2932
"grader_api_key_env": "OPENROUTER_API_KEY",

packages/browseros-agent/apps/eval/configs/debug-test.json

Lines changed: 0 additions & 23 deletions
This file was deleted.

packages/browseros-agent/apps/eval/configs/fireworks-minimax-m2.json

Lines changed: 0 additions & 21 deletions
This file was deleted.

packages/browseros-agent/apps/eval/configs/mind2web-test.json

Lines changed: 0 additions & 18 deletions
This file was deleted.

packages/browseros-agent/apps/eval/configs/orchestrator-executor-clado-test.json

Lines changed: 0 additions & 32 deletions
This file was deleted.

packages/browseros-agent/apps/eval/configs/gemini-computer-use.json renamed to packages/browseros-agent/apps/eval/configs/test_gemini-computer-use.json

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,20 @@
99
"turnLimit": 100
1010
},
1111
"dataset": "../data/test-set.jsonl",
12-
"output_dir": "../results/gemini-computer-use-test-set2",
1312
"num_workers": 1,
1413
"restart_server_per_task": true,
1514
"browseros": {
16-
"server_url": "http://127.0.0.1:9110"
15+
"server_url": "http://127.0.0.1:9110",
16+
"base_cdp_port": 9010,
17+
"base_server_port": 9110,
18+
"base_extension_port": 9310,
19+
"load_extensions": false,
20+
"headless": false
1721
},
22+
"captcha": {
23+
"api_key_env": "NOPECHA_API_KEY"
24+
},
25+
"graders": ["performance_grader"],
1826
"grader_api_key_env": "OPENROUTER_API_KEY",
1927
"grader_base_url": "https://openrouter.ai/api/v1",
2028
"grader_model": "openai/gpt-4.1",

packages/browseros-agent/apps/eval/configs/mind2web-full.json renamed to packages/browseros-agent/apps/eval/configs/test_mind2web.json

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -6,11 +6,20 @@
66
"apiKey": "OPENROUTER_API_KEY"
77
},
88
"dataset": "../data/mind2web.jsonl",
9-
"output_dir": "../results/mind2web-full",
109
"num_workers": 5,
10+
"restart_server_per_task": true,
1111
"browseros": {
12-
"server_url": "http://127.0.0.1:9110"
12+
"server_url": "http://127.0.0.1:9110",
13+
"base_cdp_port": 9010,
14+
"base_server_port": 9110,
15+
"base_extension_port": 9310,
16+
"load_extensions": false,
17+
"headless": false
1318
},
19+
"captcha": {
20+
"api_key_env": "NOPECHA_API_KEY"
21+
},
22+
"graders": ["performance_grader"],
1423
"grader_api_key_env": "OPENROUTER_API_KEY",
1524
"grader_base_url": "https://openrouter.ai/api/v1",
1625
"grader_model": "openai/gpt-4.1",

packages/browseros-agent/apps/eval/configs/webvoyager-full.json renamed to packages/browseros-agent/apps/eval/configs/test_webvoyager.json

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,16 +8,20 @@
88
"supportsImages": true
99
},
1010
"dataset": "../data/webvoyager.jsonl",
11-
"output_dir": "../results/webvoyager-cdp-server",
1211
"num_workers": 3,
12+
"restart_server_per_task": true,
1313
"browseros": {
1414
"server_url": "http://127.0.0.1:9110",
1515
"base_cdp_port": 9010,
1616
"base_server_port": 9110,
1717
"base_extension_port": 9310,
1818
"load_extensions": false,
19-
"headless": true
19+
"headless": false
2020
},
21+
"captcha": {
22+
"api_key_env": "NOPECHA_API_KEY"
23+
},
24+
"graders": ["performance_grader"],
2125
"grader_api_key_env": "OPENROUTER_API_KEY",
2226
"grader_base_url": "https://openrouter.ai/api/v1",
2327
"grader_model": "openai/gpt-4.1",

packages/browseros-agent/apps/eval/configs/yutori-navigator.json renamed to packages/browseros-agent/apps/eval/configs/test_yutori-navigator.json

Lines changed: 12 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,22 @@
99
"turnLimit": 100
1010
},
1111
"dataset": "../data/test-set.jsonl",
12-
"output_dir": "../results/yutori-navigator",
1312
"num_workers": 1,
1413
"restart_server_per_task": true,
1514
"browseros": {
16-
"server_url": "http://127.0.0.1:9110"
15+
"server_url": "http://127.0.0.1:9110",
16+
"base_cdp_port": 9010,
17+
"base_server_port": 9110,
18+
"base_extension_port": 9310,
19+
"load_extensions": false,
20+
"headless": false
1721
},
18-
"timeout_ms": 1200000,
22+
"captcha": {
23+
"api_key_env": "NOPECHA_API_KEY"
24+
},
25+
"graders": ["performance_grader"],
1926
"grader_api_key_env": "OPENROUTER_API_KEY",
2027
"grader_base_url": "https://openrouter.ai/api/v1",
21-
"grader_model": "openai/gpt-4.1"
28+
"grader_model": "openai/gpt-4.1",
29+
"timeout_ms": 1200000
2230
}

0 commit comments

Comments
 (0)