Skip to content

Commit 55a57b0

Browse files
committed
remove task json file, use the one from the webarena-verified library. Update task template to include revision number
1 parent 75738c4 commit 55a57b0

File tree

10 files changed

+847
-24817
lines changed

10 files changed

+847
-24817
lines changed

browsergym/experiments/src/browsergym/experiments/benchmark/metadata/webarena_verified.csv

Lines changed: 812 additions & 812 deletions
Large diffs are not rendered by default.

browsergym/experiments/src/browsergym/experiments/benchmark/utils.py

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -157,16 +157,16 @@ def prepare_backend(backend: str):
157157
)
158158
massage_tasks(
159159
[
160-
f"webarena_verified.{intent_template_id}.{task_id}"
161-
for intent_template_id, task_id in [
162-
(23, 410), # reddit
163-
(330, 533), # gitlab
164-
(87, 561), # gitlab wiki
165-
(88, 562), # gitlab reddit
166-
(165, 574), # shopping
167-
(16, 640), # reddit
168-
(253, 680), # shopping_admin
169-
(94, 740), # wiki map
160+
f"webarena_verified.{intent_template_id}.{task_id}.{revision}"
161+
for intent_template_id, task_id, revision in [
162+
(23, 410, 2), # reddit
163+
(330, 533, 2), # gitlab
164+
(87, 561, 3), # gitlab wiki
165+
(88, 562, 2), # gitlab reddit
166+
(165, 574, 2), # shopping
167+
(16, 640, 2), # reddit
168+
(253, 680, 2), # shopping_admin
169+
(94, 740, 2), # wiki map
170170
]
171171
]
172172
)

browsergym/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ authors = [
1616
{name = "Thibault Le Sellier De Chezelles"},
1717
{name = "Tom Marty"},
1818
{name = "Aman Jaiswal"},
19+
{name = "Nicolas Gontier"},
1920
]
2021
readme = "README.md"
2122
requires-python = ">3.10"

browsergym/webarena_verified/README.md

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -49,4 +49,10 @@ benchmark = benchmark.subset_from_list(
4949
)
5050
```
5151

52-
**NOTE**: Tasks are registered with this template: `webarena_verified.{intent_template_id}.{task_id}`
52+
#### 3. Task gym ID format
53+
54+
Tasks are registered to gym with this template: `webarena_verified.{intent_template_id}.{task_id}.{revision}`
55+
56+
- the `intent_template_id` (int) refers to the template of the question. Multiple tasks can have the same template question but with different instantiations.
57+
- the `task_id` (int from 0 to 811) is unique to each question. This is the same task ID as in the original webarena benchmark.
58+
- the `revision` (int) is a per-task version number that keeps track of updates made to that webarena-verified task over time.

browsergym/webarena_verified/pyproject.toml

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -29,11 +29,6 @@ path = "../core/src/browsergym/core/__init__.py"
2929
[tool.hatch.metadata.hooks.requirements_txt]
3030
files = ["requirements.txt"]
3131

32-
[tool.hatch.build]
33-
include = [
34-
"src/browsergym/webarena_verified/webarena_verified.json"
35-
]
36-
3732
[tool.hatch.metadata]
3833
allow-direct-references = true
3934

browsergym/webarena_verified/src/browsergym/webarena_verified/__init__.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,10 @@
1414
ALL_WEBARENA_TASK_IDS = []
1515

1616
# register all WebArena Verified benchmark tasks
17-
for task_id, intent_template_id in zip(config.TASK_IDS, config.INTENT_TEMPLATE_IDS):
18-
gym_id = f"webarena_verified.{intent_template_id}.{task_id}"
17+
for task_id, intent_template_id, revision in zip(
18+
config.TASK_IDS, config.INTENT_TEMPLATE_IDS, config.REVISIONS
19+
):
20+
gym_id = f"webarena_verified.{intent_template_id}.{task_id}.{revision}"
1921
register_task(
2022
gym_id,
2123
task.WebArenaVerifiedTask,

browsergym/webarena_verified/src/browsergym/webarena_verified/config.py

Lines changed: 7 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -5,34 +5,19 @@
55

66
TASK_IDS = range(812)
77
INTENT_TEMPLATE_IDS = []
8+
REVISIONS = []
89

9-
with open(Path(__file__).parent / "webarena_verified.json", "r") as f:
10-
data = json.load(f)
11-
12-
# Check if the json file is the same as the one in the webarena-verified repository
13-
library_json_string = (
10+
# Load the json file from the webarena-verified library
11+
data = json.loads(
1412
importlib.resources.files("webarena_verified")
1513
.joinpath("assets/dataset/webarena-verified.json")
1614
.read_text()
1715
)
18-
library_json = json.loads(library_json_string)
19-
20-
if json.dumps(data, sort_keys=True, indent=2) != json.dumps(library_json, sort_keys=True, indent=2):
21-
print(
22-
"Warning: the json file is not the same as the one in the webarena-verified repository. Consider updating the library."
23-
)
24-
print("=" * 100)
25-
print("Differences:")
26-
for diff in difflib.unified_diff(
27-
json.dumps(data, sort_keys=True, indent=2).splitlines(),
28-
json.dumps(library_json, sort_keys=True, indent=2).splitlines(),
29-
):
30-
print(diff)
31-
print("=" * 100)
3216

3317
for task in data:
3418
INTENT_TEMPLATE_IDS.append(task["intent_template_id"])
19+
REVISIONS.append(task["revision"])
3520

36-
assert len(INTENT_TEMPLATE_IDS) == len(
37-
TASK_IDS
38-
), "Number of intent template IDs must match number of task IDs"
21+
assert (
22+
len(INTENT_TEMPLATE_IDS) == len(TASK_IDS) == len(REVISIONS)
23+
), f"Number of intent template IDs ({len(INTENT_TEMPLATE_IDS)}), task IDs ({len(TASK_IDS)}), and revisions ({len(REVISIONS)}) must match"

browsergym/webarena_verified/src/browsergym/webarena_verified/evaluators.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
from platform-labs-agent-eval-harness.
44
"""
55

6+
import importlib.resources
67
import json
78
import logging
89
import tempfile
@@ -46,7 +47,9 @@ def __init__(self, webarena_instance: WebArenaInstance):
4647
"""
4748
# Create configuration for all sites and homepage from webarena_instance
4849
config = WebArenaVerifiedConfig(
49-
test_data_file=Path(__file__).parent.joinpath("webarena_verified.json"),
50+
test_data_file=importlib.resources.files("webarena_verified").joinpath(
51+
"assets/dataset/webarena-verified.json"
52+
),
5053
environments={
5154
**{
5255
site: EnvironmentConfig(

browsergym/webarena_verified/src/browsergym/webarena_verified/task.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -46,8 +46,8 @@ def __init__(
4646

4747
# Load the webarena-verified.json dataset file shipped with the webarena_verified library
4848
all_configs_str = (
49-
importlib.resources.files("browsergym.webarena_verified")
50-
.joinpath("webarena_verified.json")
49+
importlib.resources.files("webarena_verified")
50+
.joinpath("assets/dataset/webarena-verified.json")
5151
.read_text()
5252
)
5353

0 commit comments

Comments
 (0)