Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e9bfab0
Fix GT for admin_001_002
krithika-ramakrishnan Nov 4, 2025
c94c884
Fix GT issue in admin_001_003
krithika-ramakrishnan Nov 4, 2025
e46733d
Fix prerequisite issue for sales_005_005
krithika-ramakrishnan Nov 4, 2025
7be380c
Add separate test scripts for evaluation, prerequisites installation …
krithika-ramakrishnan Nov 4, 2025
43f520a
Clarify the error messages in admin_032
krithika-ramakrishnan Nov 6, 2025
b7dd738
Replace bulk delete with parallel single deletes to get failure ack
krithika-ramakrishnan Nov 6, 2025
c4c5103
Remove redundant evaluation testing code
krithika-ramakrishnan Nov 6, 2025
31a6c3e
Fix exact description matching issue in sales_005
krithika-ramakrishnan Nov 6, 2025
a396dcc
Move from print statements to logging, instantiate evaluators only once
krithika-ramakrishnan Nov 7, 2025
b7871e8
Mitigate sales_001 issue
krithika-ramakrishnan Nov 7, 2025
2527821
Fix GT issue in admin_003_002
krithika-ramakrishnan Nov 7, 2025
a42e7b5
Reset every batch in main_bu
krithika-ramakrishnan Nov 10, 2025
38260a3
Install prerequisites one by one
krithika-ramakrishnan Nov 10, 2025
5721c23
Log resetting errors
krithika-ramakrishnan Nov 10, 2025
d4561a8
Fix Product2 reset issue
krithika-ramakrishnan Nov 10, 2025
cfd2128
Show object delete failures
krithika-ramakrishnan Nov 10, 2025
8445b78
Fix service_002 reset issue
krithika-ramakrishnan Nov 10, 2025
f56c996
Fix UserRole reset issue for admin_035
krithika-ramakrishnan Nov 10, 2025
d02aed1
Fix dependency of Queue on Assignment rule for service_001
krithika-ramakrishnan Nov 10, 2025
0753164
Add logging to test files to print log to stdout
krithika-ramakrishnan Nov 10, 2025
711feec
update readme
Yutong-Dai Feb 16, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 18 additions & 8 deletions data/test_demo_aug.json
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@
"\"100,000\""
]
],
"field_name": "Amount"
"field_name": "AMOUNT"
},
"difficulty": "medium",
"query_template_name": "create_report",
Expand Down Expand Up @@ -98,7 +98,7 @@
"human_review_comment": "rejected since it is the same. Rewriten",
"ground_truth_dict": {
"object_name": "Accounts",
"report_name": "ActiveAccountsRPT",
"report_name": "MyAccountsRPT",
"filter_type": "standard filter",
"filters": [
[
Expand Down Expand Up @@ -538,7 +538,7 @@
"operator2": "equals",
"value2": "Closed Lost",
"field3": "AMOUNT",
"operator3": "greaterThan",
"operator3": "greaterOrEqual",
"value3": "\"5,000\"",
"field4": "TYPE",
"operator4": "equals",
Expand Down Expand Up @@ -8386,6 +8386,8 @@
}
},
"objects": [
"Quote",
"Opportunity",
"PricebookEntry",
"Pricebook2"
],
Expand Down Expand Up @@ -8429,6 +8431,8 @@
}
},
"objects": [
"Quote",
"Opportunity",
"PricebookEntry",
"Pricebook2"
],
Expand Down Expand Up @@ -8472,6 +8476,8 @@
}
},
"objects": [
"Quote",
"Opportunity",
"PricebookEntry",
"Pricebook2"
],
Expand Down Expand Up @@ -8515,6 +8521,8 @@
}
},
"objects": [
"Quote",
"Opportunity",
"PricebookEntry",
"Pricebook2"
],
Expand Down Expand Up @@ -8558,6 +8566,8 @@
}
},
"objects": [
"Quote",
"Opportunity",
"PricebookEntry",
"Pricebook2"
],
Expand Down Expand Up @@ -10488,7 +10498,7 @@
"QueueSobject",
"Queue"
],
"metadata_types": []
"metadata_types": ["AssignmentRules"]
},
"instance_dict": {
"queue_name": "Shoe Case Support",
Expand Down Expand Up @@ -10536,7 +10546,7 @@
"QueueSobject",
"Queue"
],
"metadata_types": []
"metadata_types": ["AssignmentRules"]
},
"instance_dict": {
"queue_name": "VIP Support",
Expand Down Expand Up @@ -10584,7 +10594,7 @@
"QueueSobject",
"Queue"
],
"metadata_types": []
"metadata_types": ["AssignmentRules"]
},
"instance_dict": {
"queue_name": "Marketing Support",
Expand Down Expand Up @@ -10631,7 +10641,7 @@
"QueueSobject",
"Queue"
],
"metadata_types": []
"metadata_types": ["AssignmentRules"]
},
"instance_dict": {
"queue_name": "tech support specialists",
Expand Down Expand Up @@ -10679,7 +10689,7 @@
"QueueSobject",
"Queue"
],
"metadata_types": []
"metadata_types": ["AssignmentRules"]
},
"instance_dict": {
"queue_name": "enterprise client support",
Expand Down
26 changes: 18 additions & 8 deletions data/test_zero_shot.json
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,7 @@
"\"100,000\""
]
],
"field_name": "Amount"
"field_name": "AMOUNT"
},
"difficulty": "medium",
"query_template_name": "create_report",
Expand Down Expand Up @@ -96,7 +96,7 @@
"human_review_comment": "rejected since it is the same. Rewriten",
"ground_truth_dict": {
"object_name": "Accounts",
"report_name": "ActiveAccountsRPT",
"report_name": "MyAccountsRPT",
"filter_type": "standard filter",
"filters": [
[
Expand Down Expand Up @@ -527,7 +527,7 @@
"operator2": "equals",
"value2": "Closed Lost",
"field3": "AMOUNT",
"operator3": "greaterThan",
"operator3": "greaterOrEqual",
"value3": "\"5,000\"",
"field4": "TYPE",
"operator4": "equals",
Expand Down Expand Up @@ -8185,6 +8185,8 @@
}
},
"objects": [
"Quote",
"Opportunity",
"PricebookEntry",
"Pricebook2"
],
Expand Down Expand Up @@ -8227,6 +8229,8 @@
}
},
"objects": [
"Quote",
"Opportunity",
"PricebookEntry",
"Pricebook2"
],
Expand Down Expand Up @@ -8269,6 +8273,8 @@
}
},
"objects": [
"Quote",
"Opportunity",
"PricebookEntry",
"Pricebook2"
],
Expand Down Expand Up @@ -8311,6 +8317,8 @@
}
},
"objects": [
"Quote",
"Opportunity",
"PricebookEntry",
"Pricebook2"
],
Expand Down Expand Up @@ -8353,6 +8361,8 @@
}
},
"objects": [
"Quote",
"Opportunity",
"PricebookEntry",
"Pricebook2"
],
Expand Down Expand Up @@ -10243,7 +10253,7 @@
"QueueSobject",
"Queue"
],
"metadata_types": []
"metadata_types": ["AssignmentRules"]
},
"instance_dict": {
"queue_name": "Shoe Case Support",
Expand Down Expand Up @@ -10290,7 +10300,7 @@
"QueueSobject",
"Queue"
],
"metadata_types": []
"metadata_types": ["AssignmentRules"]
},
"instance_dict": {
"queue_name": "VIP Support",
Expand Down Expand Up @@ -10337,7 +10347,7 @@
"QueueSobject",
"Queue"
],
"metadata_types": []
"metadata_types": ["AssignmentRules"]
},
"instance_dict": {
"queue_name": "Marketing Support",
Expand Down Expand Up @@ -10383,7 +10393,7 @@
"QueueSobject",
"Queue"
],
"metadata_types": []
"metadata_types": ["AssignmentRules"]
},
"instance_dict": {
"queue_name": "tech support specialists",
Expand Down Expand Up @@ -10430,7 +10440,7 @@
"QueueSobject",
"Queue"
],
"metadata_types": []
"metadata_types": ["AssignmentRules"]
},
"instance_dict": {
"queue_name": "enterprise client support",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -369,18 +369,7 @@ sudo systemctl start osworld_server.service

### Accessibility Tree Support

To support the accessibility tree functionality, you'll need to install pyastpi2 in your Ubuntu environment. This package enables access to accessibility information and tree structures.

Installation steps:

```bash
# Update package list and ensure pip is installed
sudo apt-get update
sudo apt-get install python3-pip

# Install pyastpi2 using pip
pip3 install pyastpi2
```
Due to Salesforce security requirements, we are unable to provide the accessibility-tree setup instructions here. Please refer to the instructions in the original repository and proceed at your own discretion.

### Xorg Configuration

Expand Down
23 changes: 13 additions & 10 deletions main_bu.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,10 @@
import glob
from playwright.async_api import async_playwright

from utils import run_evaluate, run_reset, LogFormatter, split_task_config_pool_into_batches
from utils import run_evaluate, run_reset, LogFormatter, split_task_config_pool_into_batches, capture_logs_to_file
from args import get_args

from scuba.phases.evaluation.master_evaluator import MilestoneEvaluator
from scuba.phases.resetter import Resetter
from scuba.helpers.salesforce_commands import authorize_using_access_token, install_initial_data, retrieve_initial_state_metadata, create_project_if_not_exists
# build env and agent
from browser_use import Controller
Expand Down Expand Up @@ -203,7 +202,7 @@ async def replan(param_model:NoParamsAction, browser: BrowserContextBugFix) -> A

async with asyncio.Lock():
try:
evaluator = MilestoneEvaluator(args.org_alias)

# breakpoint()
score_card = evaluator.evaluate_instance(task_instance_dict, agent_answer)
evaluation_result = score_card.__dict__()
Expand Down Expand Up @@ -339,13 +338,7 @@ async def test(args: argparse.Namespace, task_config_pool: List[Dict]) -> None:
total_batches = len(task_config_pool_batches)
logger.info(f"Split the task_config_pool into {total_batches} batches due to constraints and dependencies of different tasks")

if args.reset_orgs_before_eval:
# The reset and evaluation are based on local files, so we need to reset the Salesforce orgs first
logger.info(f"Bulk resetting the salesforce orgs...")
time_start = time.perf_counter()
run_reset(task_config_pool, args.org_alias)
time_end = time.perf_counter()
logger.info(f"Done bulk resetting the salesforce orgs in {time_end - time_start:.2f} seconds")

if args.solutions == 'bu':
# build auxiliary components
retriever = None
Expand Down Expand Up @@ -374,6 +367,15 @@ async def test(args: argparse.Namespace, task_config_pool: List[Dict]) -> None:
for batch_idx, task_config_pool in enumerate(task_config_pool_batches):
num_tasks = len(task_config_pool)
logger.info(f"Starting batch {batch_idx} with {num_tasks} tasks")
# The reset and evaluation are based on local files, so we need to reset the Salesforce orgs first
logger.info(f"Bulk resetting the salesforce orgs...")
time_start=time.perf_counter()
file=os.path.join(args.result_dir,"reset.log")
with capture_logs_to_file(file):
run_reset(task_config_pool, args.org_alias)

time_end=time.perf_counter()
logger.info(f"Done bulk resetting the salesforce orgs in {time_end-time_start:.2f} seconds")
semaphore = asyncio.Semaphore(args.max_concurrent_tasks)
job_queue = []
for task_instance_dict in task_config_pool:
Expand Down Expand Up @@ -410,6 +412,7 @@ def get_unfinished_task_ids(task_instance_dicts: List[Dict], target_dir: str):
if __name__ == '__main__':
args = get_args()
assert args.org_alias == os.getenv("ORG_ALIAS"), f"org_alias: {args.org_alias} is not the same as the org_alias in the .env file: {os.getenv('ORG_ALIAS')}. The one in the .env file is used to login in the remote desktop environment."
evaluator=MilestoneEvaluator(args.org_alias)
args.result_dir = os.path.join(args.result_dir, args.run_name)

assert args.total_desired_envs == args.max_concurrent_tasks, f"total_desired_envs: {args.total_desired_envs} is not the same as max_concurrent_tasks: {args.max_concurrent_tasks}"
Expand Down
Loading