SalesforceAIResearch · kckrithika · Nov 4, 2025 · Nov 4, 2025 · Nov 4, 2025 · Nov 4, 2025
diff --git a/data/test_demo_aug.json b/data/test_demo_aug.json
@@ -62,7 +62,7 @@
                     "\"100,000\""
                 ]
             ],
-            "field_name": "Amount"
+            "field_name": "AMOUNT"
         },
         "difficulty": "medium",
         "query_template_name": "create_report",
@@ -98,7 +98,7 @@
         "human_review_comment": "rejected since it is the same. Rewriten",
         "ground_truth_dict": {
             "object_name": "Accounts",
-            "report_name": "ActiveAccountsRPT",
+            "report_name": "MyAccountsRPT",
             "filter_type": "standard filter",
             "filters": [
                 [
@@ -538,7 +538,7 @@
             "operator2": "equals",
             "value2": "Closed Lost",
             "field3": "AMOUNT",
-            "operator3": "greaterThan",
+            "operator3": "greaterOrEqual",
             "value3": "\"5,000\"",
             "field4": "TYPE",
             "operator4": "equals",
@@ -8386,6 +8386,8 @@
                 }
             },
             "objects": [
+                "Quote",
+                "Opportunity",
                 "PricebookEntry",
                 "Pricebook2"
             ],
@@ -8429,6 +8431,8 @@
                 }
             },
             "objects": [
+                "Quote",
+                "Opportunity",
                 "PricebookEntry",
                 "Pricebook2"
             ],
@@ -8472,6 +8476,8 @@
                 }
             },
             "objects": [
+                "Quote",
+                "Opportunity",
                 "PricebookEntry",
                 "Pricebook2"
             ],
@@ -8515,6 +8521,8 @@
                 }
             },
             "objects": [
+                "Quote",
+                "Opportunity",
                 "PricebookEntry",
                 "Pricebook2"
             ],
@@ -8558,6 +8566,8 @@
                 }
             },
             "objects": [
+                "Quote",
+                "Opportunity",
                 "PricebookEntry",
                 "Pricebook2"
             ],
@@ -10488,7 +10498,7 @@
                 "QueueSobject",
                 "Queue"
             ],
-            "metadata_types": []
+            "metadata_types": ["AssignmentRules"]
         },
         "instance_dict": {
             "queue_name": "Shoe Case Support",
@@ -10536,7 +10546,7 @@
                 "QueueSobject",
                 "Queue"
             ],
-            "metadata_types": []
+            "metadata_types": ["AssignmentRules"]
         },
         "instance_dict": {
             "queue_name": "VIP Support",
@@ -10584,7 +10594,7 @@
                 "QueueSobject",
                 "Queue"
             ],
-            "metadata_types": []
+            "metadata_types": ["AssignmentRules"]
         },
         "instance_dict": {
             "queue_name": "Marketing Support",
@@ -10631,7 +10641,7 @@
                 "QueueSobject",
                 "Queue"
             ],
-            "metadata_types": []
+            "metadata_types": ["AssignmentRules"]
         },
         "instance_dict": {
             "queue_name": "tech support specialists",
@@ -10679,7 +10689,7 @@
                 "QueueSobject",
                 "Queue"
             ],
-            "metadata_types": []
+            "metadata_types": ["AssignmentRules"]
         },
         "instance_dict": {
             "queue_name": "enterprise client support",

diff --git a/data/test_zero_shot.json b/data/test_zero_shot.json
@@ -61,7 +61,7 @@
                     "\"100,000\""
                 ]
             ],
-            "field_name": "Amount"
+            "field_name": "AMOUNT"
         },
         "difficulty": "medium",
         "query_template_name": "create_report",
@@ -96,7 +96,7 @@
         "human_review_comment": "rejected since it is the same. Rewriten",
         "ground_truth_dict": {
             "object_name": "Accounts",
-            "report_name": "ActiveAccountsRPT",
+            "report_name": "MyAccountsRPT",
             "filter_type": "standard filter",
             "filters": [
                 [
@@ -527,7 +527,7 @@
             "operator2": "equals",
             "value2": "Closed Lost",
             "field3": "AMOUNT",
-            "operator3": "greaterThan",
+            "operator3": "greaterOrEqual",
             "value3": "\"5,000\"",
             "field4": "TYPE",
             "operator4": "equals",
@@ -8185,6 +8185,8 @@
                 }
             },
             "objects": [
+                "Quote",
+                "Opportunity",
                 "PricebookEntry",
                 "Pricebook2"
             ],
@@ -8227,6 +8229,8 @@
                 }
             },
             "objects": [
+                "Quote",
+                "Opportunity",
                 "PricebookEntry",
                 "Pricebook2"
             ],
@@ -8269,6 +8273,8 @@
                 }
             },
             "objects": [
+                "Quote",
+                "Opportunity",
                 "PricebookEntry",
                 "Pricebook2"
             ],
@@ -8311,6 +8317,8 @@
                 }
             },
             "objects": [
+                "Quote",
+                "Opportunity",
                 "PricebookEntry",
                 "Pricebook2"
             ],
@@ -8353,6 +8361,8 @@
                 }
             },
             "objects": [
+                "Quote",
+                "Opportunity",
                 "PricebookEntry",
                 "Pricebook2"
             ],
@@ -10243,7 +10253,7 @@
                 "QueueSobject",
                 "Queue"
             ],
-            "metadata_types": []
+            "metadata_types": ["AssignmentRules"]
         },
         "instance_dict": {
             "queue_name": "Shoe Case Support",
@@ -10290,7 +10300,7 @@
                 "QueueSobject",
                 "Queue"
             ],
-            "metadata_types": []
+            "metadata_types": ["AssignmentRules"]
         },
         "instance_dict": {
             "queue_name": "VIP Support",
@@ -10337,7 +10347,7 @@
                 "QueueSobject",
                 "Queue"
             ],
-            "metadata_types": []
+            "metadata_types": ["AssignmentRules"]
         },
         "instance_dict": {
             "queue_name": "Marketing Support",
@@ -10383,7 +10393,7 @@
                 "QueueSobject",
                 "Queue"
             ],
-            "metadata_types": []
+            "metadata_types": ["AssignmentRules"]
         },
         "instance_dict": {
             "queue_name": "tech support specialists",
@@ -10430,7 +10440,7 @@
                 "QueueSobject",
                 "Queue"
             ],
-            "metadata_types": []
+            "metadata_types": ["AssignmentRules"]
         },
         "instance_dict": {
             "queue_name": "enterprise client support",

diff --git a/envs/remote_docker_env/vendor/OSworld/desktop_env/server/README.md b/envs/remote_docker_env/vendor/OSworld/desktop_env/server/README.md
@@ -369,18 +369,7 @@ sudo systemctl start osworld_server.service
 
 ### Accessibility Tree Support
 
-To support the accessibility tree functionality, you'll need to install pyastpi2 in your Ubuntu environment. This package enables access to accessibility information and tree structures.
-
-Installation steps:
-
-```bash
-# Update package list and ensure pip is installed
-sudo apt-get update
-sudo apt-get install python3-pip
-
-# Install pyastpi2 using pip
-pip3 install pyastpi2
-```
+Due to Salesforce security requirements, we’re unable to provide the installation steps for the accessibility tree dependency here. Please refer to the instructions in the original OSWorld repository and proceed at your own discretion.
 
 ### Xorg Configuration
 

diff --git a/main_bu.py b/main_bu.py
@@ -17,11 +17,10 @@
 import glob
 from playwright.async_api import async_playwright
 
-from utils import run_evaluate, run_reset, LogFormatter, split_task_config_pool_into_batches
+from utils import run_evaluate, run_reset, LogFormatter, split_task_config_pool_into_batches, capture_logs_to_file
 from args import get_args
 
 from scuba.phases.evaluation.master_evaluator import MilestoneEvaluator
-from scuba.phases.resetter import Resetter
 from scuba.helpers.salesforce_commands import authorize_using_access_token, install_initial_data, retrieve_initial_state_metadata, create_project_if_not_exists
 # build env and agent
 from browser_use import Controller
@@ -203,7 +202,7 @@ async def replan(param_model:NoParamsAction, browser: BrowserContextBugFix) -> A
 
         async with asyncio.Lock():
             try:
-                evaluator = MilestoneEvaluator(args.org_alias)
+
                 # breakpoint()
                 score_card = evaluator.evaluate_instance(task_instance_dict, agent_answer)
                 evaluation_result = score_card.__dict__()
@@ -339,13 +338,7 @@ async def test(args: argparse.Namespace, task_config_pool: List[Dict]) -> None:
         total_batches = len(task_config_pool_batches)
         logger.info(f"Split the task_config_pool into {total_batches} batches due to constraints and dependencies of different tasks")
 
-        if args.reset_orgs_before_eval:
-            # Since the reset and evaluation are based on local files; we need to reset the salesforce orgs first
-            logger.info(f"Bulk resetting the salesforce orgs...")
-            time_start = time.perf_counter()
-            run_reset(task_config_pool, args.org_alias)
-            time_end = time.perf_counter()
-            logger.info(f"Done bulk resetting the salesforce orgs in {time_end - time_start:.2f} seconds")
+
         if args.solutions == 'bu':
             # build auxilary components
             retriever = None
@@ -374,6 +367,15 @@ async def test(args: argparse.Namespace, task_config_pool: List[Dict]) -> None:
         for batch_idx, task_config_pool in enumerate(task_config_pool_batches):
             num_tasks = len(task_config_pool)
             logger.info(f"Starting batch {batch_idx} with {num_tasks} tasks")
+            # Reset and evaluation are both based on local files, so we need to reset the Salesforce orgs first
+            logger.info(f"Bulk resetting the salesforce orgs...")
+            time_start=time.perf_counter()
+            file=os.path.join(args.result_dir,"reset.log")
+            with capture_logs_to_file(file):
+                run_reset(task_config_pool, args.org_alias)
+
+            time_end=time.perf_counter()
+            logger.info(f"Done bulk resetting the salesforce orgs in {time_end-time_start:.2f} seconds")
             semaphore = asyncio.Semaphore(args.max_concurrent_tasks)
             job_queue = []
             for task_instance_dict in task_config_pool:
@@ -410,6 +412,7 @@ def get_unfinished_task_ids(task_instance_dicts: List[Dict], target_dir: str):
 if __name__ == '__main__':
     args = get_args()
     assert args.org_alias == os.getenv("ORG_ALIAS"), f"org_alias: {args.org_alias} is not the same as the org_alias in the .env file: {os.getenv('ORG_ALIAS')}. The one in the .env file is used to login in the remote desktop environment."
+    evaluator=MilestoneEvaluator(args.org_alias)
     args.result_dir = os.path.join(args.result_dir, args.run_name)
 
     assert args.total_desired_envs == args.max_concurrent_tasks, f"total_desired_envs: {args.total_desired_envs} is not the same as max_concurrent_tasks: {args.max_concurrent_tasks}"