From e9bfab01faa68b627864218a09b1be19857d24d3 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Tue, 4 Nov 2025 14:08:24 -0500 Subject: [PATCH 01/21] Fix GT for admin_001_002 --- data/test_demo_aug.json | 2 +- data/test_zero_shot.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/test_demo_aug.json b/data/test_demo_aug.json index bdf598f..b15a2d0 100644 --- a/data/test_demo_aug.json +++ b/data/test_demo_aug.json @@ -62,7 +62,7 @@ "\"100,000\"" ] ], - "field_name": "Amount" + "field_name": "AMOUNT" }, "difficulty": "medium", "query_template_name": "create_report", diff --git a/data/test_zero_shot.json b/data/test_zero_shot.json index 38142fd..36a214f 100644 --- a/data/test_zero_shot.json +++ b/data/test_zero_shot.json @@ -61,7 +61,7 @@ "\"100,000\"" ] ], - "field_name": "Amount" + "field_name": "AMOUNT" }, "difficulty": "medium", "query_template_name": "create_report", From c94c8846735d4c8a8d0208209f61c84933f26600 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Tue, 4 Nov 2025 14:10:11 -0500 Subject: [PATCH 02/21] Fix GT issue in admin_001_003 --- data/test_demo_aug.json | 2 +- data/test_zero_shot.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/test_demo_aug.json b/data/test_demo_aug.json index b15a2d0..c01f455 100644 --- a/data/test_demo_aug.json +++ b/data/test_demo_aug.json @@ -98,7 +98,7 @@ "human_review_comment": "rejected since it is the same. Rewriten", "ground_truth_dict": { "object_name": "Accounts", - "report_name": "ActiveAccountsRPT", + "report_name": "MyAccountsRPT", "filter_type": "standard filter", "filters": [ [ diff --git a/data/test_zero_shot.json b/data/test_zero_shot.json index 36a214f..458fb11 100644 --- a/data/test_zero_shot.json +++ b/data/test_zero_shot.json @@ -96,7 +96,7 @@ "human_review_comment": "rejected since it is the same. Rewriten", "ground_truth_dict": { "object_name": "Accounts", - "report_name": "ActiveAccountsRPT", + "report_name": "MyAccountsRPT", "filter_type": "standard filter", "filters": [ [ From e46733d57e9792db4fdc17c2a4475d8d52ab95dc Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Tue, 4 Nov 2025 14:30:05 -0500 Subject: [PATCH 03/21] Fix prerequisite issue for sales_005_005 --- scuba/prerequisites/data/bulk_data/Opportunity.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scuba/prerequisites/data/bulk_data/Opportunity.json b/scuba/prerequisites/data/bulk_data/Opportunity.json index c375b30..3cab442 100644 --- a/scuba/prerequisites/data/bulk_data/Opportunity.json +++ b/scuba/prerequisites/data/bulk_data/Opportunity.json @@ -4,7 +4,7 @@ {"Name": "Product Development", "StageName": "Closed Won", "Amount": 300000, "CloseDate": "2025-02-10"}, {"Name": "Consulting Service", "StageName": "Closed Lost", "Amount": 40000, "CloseDate": "2025-01-18"}, {"Name": "Maintenance Contract", "StageName": "Qualification", "Amount": 50000, "CloseDate": "2025-06-30"}, - {"Name": "Security Solutions", "StageName": "Needs Analysis", "Amount": 60000, "CloseDate": "2025-05-14"}, + {"Name": "Security Solutions", "StageName": "Qualification", "Amount": 60000, "CloseDate": "2025-05-14"}, {"Name": "IT Outsourcing", "StageName": "Value Proposition", "Amount": 70000, "CloseDate": "2025-07-08"}, {"Name": "Digital Marketing", "StageName": "Perception Analysis", "Amount": 100000, "CloseDate": "2025-08-25"}, {"Name": "Environmental Consultancy", "StageName": "Identify Decision Makers", "Amount": 85000, "CloseDate": "2025-09-12"}, From 7be380c1936979fd94b8a2df7cc23584758fcfac Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Tue, 4 Nov 2025 14:39:59 -0500 Subject: [PATCH 04/21] Add separate test scripts for evaluation, prerequisites installation and reset --- tests/test_evaluate.py | 54 ++++++++++++++++++++++++++++++++++++++ tests/test_prerequisite.py | 28 ++++++++++++++++++++ tests/test_resetter.py | 40 ++++++++++++++++++++++++++++ 3 files changed, 122 insertions(+) create mode 100644 tests/test_evaluate.py create mode 100644 tests/test_prerequisite.py create mode 100644 tests/test_resetter.py diff --git a/tests/test_evaluate.py b/tests/test_evaluate.py new file mode 100644 index 0000000..bf56899 --- /dev/null +++ b/tests/test_evaluate.py @@ -0,0 +1,54 @@ +import argparse +import json +import os +os.chdir('..') + +from scuba.phases.evaluation.master_evaluator import MilestoneEvaluator +from scuba.helpers.salesforce_commands import authorize_using_access_token + + +RED = "\033[91m" +RESET = "\033[0m" +DATA_FILE = 'data/test_zero_shot.json' + +def run_evaluator(task_instances, org_alias): + evaluator=MilestoneEvaluator(org_alias=org_alias) + for instance in task_instances: + print(f'Running evaluation on instance {instance["task_id"]}') + score = evaluator.evaluate_instance(instance, agent_answer=None) + print(f'{RED} {score.__dict__()} {RESET}') + +def get_instances_for_task(task_id): + task_instances=json.load(open(DATA_FILE)) + task_instances=[item for item in task_instances if item['task_id']==task_id] + return task_instances + +def get_instances_for_template(template_id): + task_instances=json.load(open(DATA_FILE)) + task_instances=[item for item in task_instances if item['query_template_metadata']['template_id']==template_id] + return task_instances + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Run evaluation on task instances') + parser.add_argument('--org-alias', type=str, required=True, + help='Organization alias for Salesforce authorization') + parser.add_argument('--task-id', type=str, default=None, + help='Task ID to filter instances') + parser.add_argument('--template-id', type=str, default=None, + help='Template ID to filter instances') + + args = parser.parse_args() + + org_alias = args.org_alias + authorize_using_access_token(org_alias) + + task_id = args.task_id + template_id = args.template_id + + if task_id: + task_instances = get_instances_for_task(task_id) + elif template_id: + task_instances = get_instances_for_template(template_id) + else: + task_instances = [] + run_evaluator(task_instances, org_alias) diff --git a/tests/test_prerequisite.py b/tests/test_prerequisite.py new file mode 100644 index 0000000..fe292c8 --- /dev/null +++ b/tests/test_prerequisite.py @@ -0,0 +1,28 @@ +import argparse +import json +import os +os.chdir('..') +from scuba.phases.prerequisites import Prerequisites +from scuba.helpers.salesforce_commands import authorize_using_access_token + +data = json.load(open('data/test_zero_shot.json')) +data_map = {item['task_id']: item for item in data} + +def install_preprequisites_for_task(task_id, org_alias): + config = data_map[task_id]['query_template_metadata'] + prerequisites = config['prerequisites'] + Prerequisites(org_alias, prerequisites=prerequisites).install_prerequisites() + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Install prerequisites for a task') + parser.add_argument('--task-id', type=str, required=True, + help='Task ID to install prerequisites for') + parser.add_argument('--org-alias', type=str, required=True, + help='Organization alias for Salesforce') + + args = parser.parse_args() + + task_id = args.task_id + org_alias = args.org_alias + authorize_using_access_token(org_alias) + install_preprequisites_for_task(task_id, org_alias) \ No newline at end of file diff --git a/tests/test_resetter.py b/tests/test_resetter.py new file mode 100644 index 0000000..3421d09 --- /dev/null +++ b/tests/test_resetter.py @@ -0,0 +1,40 @@ +import argparse +import json +from tqdm import tqdm +import os +os.chdir('..') +from scuba.phases.resetter import Resetter +from scuba.helpers.salesforce_commands import authorize_using_access_token + + +data = json.load(open('data/test_zero_shot.json')) +data_map = {item['task_id']: item for item in data} + +def run_reset_for_task(task_id, org_alias): + config = data_map[task_id]['query_template_metadata'] + metadata = config['metadata_types'] + objects = config['objects'] + resetter = Resetter(org_alias, metadata_types=metadata, objects=objects, prerequisites={}) + resetter.reset() + +def run_reset_for_tasks(task_list, org_alias): + for task in tqdm(task_list): + run_reset_for_task(task, org_alias) + +def run_reset_for_object(object_name, org_alias): + resetter = Resetter(org_alias, metadata_types={}, objects=[object_name], prerequisites={}) + resetter.reset() + +if __name__ == '__main__': + parser = argparse.ArgumentParser(description='Reset Salesforce org state') + parser.add_argument('--org-alias', type=str, required=True, + help='Organization alias for Salesforce') + parser.add_argument('--task-id', type=str, required=True, + help='Task ID to reset') + + args = parser.parse_args() + + org_alias = args.org_alias + task_id = args.task_id + authorize_using_access_token(org_alias) + run_reset_for_task(task_id, org_alias) \ No newline at end of file From 43f520a75129ed7a54cae29bd9e59a76ad904d56 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Thu, 6 Nov 2025 12:10:42 -0500 Subject: [PATCH 05/21] Clarify the error messages in admin_032 --- scuba/phases/evaluation/milestone_evaluator_admin_new.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/scuba/phases/evaluation/milestone_evaluator_admin_new.py b/scuba/phases/evaluation/milestone_evaluator_admin_new.py index 2db7dd4..a5ae481 100644 --- a/scuba/phases/evaluation/milestone_evaluator_admin_new.py +++ b/scuba/phases/evaluation/milestone_evaluator_admin_new.py @@ -113,9 +113,14 @@ def evaluate_template_create_global_value_set(self, data: Dict[str, Any], **kwar 'weight': 0.3 }, { - 'milestone': f'Add the list of items correctly to the global value set', + 'milestone': f'Add the list items correctly to the global value set', + 'is_success': set(items) == set([val.strip() for val in params.comma_separated_values.split(',')]), + 'weight': 0.4 + }, + { + 'milestone': f'Maintain correct order of list items', 'is_success': items == [val.strip() for val in params.comma_separated_values.split(',')], - 'weight': 0.7 + 'weight': 0.3 } ] return milestones From b7dd7380aa4bc60d724b2cc6536adfdad6f90334 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Thu, 6 Nov 2025 12:11:48 -0500 Subject: [PATCH 06/21] Replace bulk delete with parallel single deletes to get failure ack --- scuba/phases/resetter.py | 37 ++++++++++++++++++------------------- 1 file changed, 18 insertions(+), 19 deletions(-) diff --git a/scuba/phases/resetter.py b/scuba/phases/resetter.py index 01664bc..ab28ad7 100644 --- a/scuba/phases/resetter.py +++ b/scuba/phases/resetter.py @@ -65,6 +65,16 @@ def __reset_validation_rule(self): url = raw_response.get('records')[0]['attributes']['url'] delete(self.org_alias, url) + def __bulk_delete(self, object_name, record_ids): + username = get_org_info(self.org_alias)['username'] + threads = [] + for id in record_ids: + delete_command=f'sf data delete record --sobject {object_name} --record-id {id} -o {username}' + thread=threading.Thread(target=execute_sfdx_command,args=(delete_command,)) + threads.append(thread) + thread.start() + for thread in tqdm(threads,desc="Deleting records"): + thread.join() def __reset_data(self): """ @@ -105,22 +115,7 @@ def __reset_data(self): new_ids = set(new_df['Id'].values.tolist()) print(f'Found {len(new_ids)} new IDs in {o} object.') if len(new_ids) > 0: - new_df[new_df['Id'].isin(new_ids)][['Id']].to_csv(f'new_{o}.csv', index=False) - username = get_org_info(self.org_alias)['username'] - if o in ['Queue', 'Knowledge__ka']: - threads = [] - if o == 'Queue': - sobject_type = 'Group' - else: - sobject_type = o - for id in new_ids: - bulk_delete_command = f'sf data delete record --sobject {sobject_type} --record-id {id} -o {username}' - thread = threading.Thread(target=execute_sfdx_command, args=(bulk_delete_command,)) - threads.append(thread) - thread.start() - for thread in tqdm(threads, desc="Deleting records"): - thread.join() - elif o == 'UserLogin': + if o == 'UserLogin': threads = [] for id in new_ids: endpoint = f'/services/data/v62.0/sobjects/UserLogin/{id}' @@ -129,9 +124,13 @@ def __reset_data(self): thread.start() for thread in tqdm(threads, desc="Patching records"): thread.join() + continue + if o == 'Queue': + sobject_type = 'Group' else: - bulk_delete_command = f'sf data delete bulk --sobject {o} --file new_{o}.csv -o {username}' - execute_sfdx_command(bulk_delete_command) + sobject_type = o + + self.__bulk_delete(sobject_type, new_ids) # Find and patch modified Ids if old_data is not None: @@ -164,7 +163,7 @@ def __deploy_diff(self): for type in self.metadata_types: if type in ['ListView', 'MatchingRule']: - query = f'SELECT SObjectType, DeveloperName FROM {type} WHERE SystemModstamp >= LAST_N_DAYS:10' + query = f'SELECT SObjectType, DeveloperName FROM {type} WHERE SystemModstamp >= LAST_N_DAYS:20' run_query(query, type, self.org_alias) try: df = pd.read_csv(f'{type}.csv') From c4c5103548033d1ece96f9601da02c92f8978c73 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Thu, 6 Nov 2025 12:12:54 -0500 Subject: [PATCH 07/21] Remove redundant evaluation testing code --- scuba/phases/evaluation/master_evaluator.py | 23 --------------------- 1 file changed, 23 deletions(-) diff --git a/scuba/phases/evaluation/master_evaluator.py b/scuba/phases/evaluation/master_evaluator.py index 41dee06..dbb143b 100644 --- a/scuba/phases/evaluation/master_evaluator.py +++ b/scuba/phases/evaluation/master_evaluator.py @@ -116,26 +116,3 @@ def evaluate_instance(self, instance, agent_answer=None): failures = traceback.format_exc().splitlines() return ScoreCard(-1, [], failures) -if __name__ == '__main__': - from scuba.helpers.salesforce_commands import authorize_using_access_token - RED = "\033[91m" - RESET = "\033[0m" - - task_instances = json.load(open('data/test_zero_shot.json')) - template = None - # template = 'admin_009' - instance = None - instance = 'admin_008_003' - if template: - task_instances = [item for item in task_instances if item['query_template_metadata']['template_id'] == template] - elif instance: - task_instances = [item for item in task_instances if item['task_id'] == instance] - org_alias = 'YDCRMGUI' - - authorize_using_access_token(org_alias) - evaluator = MilestoneEvaluator(org_alias=org_alias) - for instance in task_instances: - print(f'Running evaluation on instance {instance["task_id"]}') - score = evaluator.evaluate_instance(instance, agent_answer=None) - print(f'{RED} {score.__dict__()} {RESET}') - From 31a6c3ec7414132b75feaa5b9bd914f03b06e37d Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Thu, 6 Nov 2025 14:32:34 -0500 Subject: [PATCH 08/21] Fix exact description matching issue in sales_005 --- .../evaluation/milestone_evaluator_sales.py | 24 ++++++++++++++++--- 1 file changed, 21 insertions(+), 3 deletions(-) diff --git a/scuba/phases/evaluation/milestone_evaluator_sales.py b/scuba/phases/evaluation/milestone_evaluator_sales.py index d106a59..d0f516d 100644 --- a/scuba/phases/evaluation/milestone_evaluator_sales.py +++ b/scuba/phases/evaluation/milestone_evaluator_sales.py @@ -5,6 +5,10 @@ import types from datetime import datetime, timedelta from typing import List, Dict, Any, Union +import gensim.downloader as api +from numpy import dot +from numpy.linalg import norm +import numpy as np from dateutil.relativedelta import relativedelta from scuba.phases.base_phase import BasePhase @@ -12,6 +16,7 @@ class MilestoneEvaluator(BasePhase): def __init__(self, org_alias): super().__init__(org_alias) + self.model = api.load("glove-wiki-gigaword-100") def evaluate_template_create_account_and_contact(self, data: Dict[str, Any], **kwargs) -> List[Dict[str, Any]]: params = types.SimpleNamespace(**kwargs) @@ -292,6 +297,19 @@ def evaluate_template_convert_lead(self, data: Dict[str, Any], **kwargs) -> List ] return milestones + def __sentence_vector(self, sentence): + words=[w for w in sentence.lower().split() if w in self.model] + return np.mean([self.model[w] for w in words],axis=0) + + def __fuzzy_match(self, string1, string2): + if string1 is None or string2 is None: + return False + v1, v2 = self.__sentence_vector(string1), self.__sentence_vector(string2) + similarity = dot(v1, v2) / (norm(v1) * norm(v2)) + if similarity > 0.8: + return True + return False + def evaluate_template_update_opportunity_stage_and_activity(self, data: Dict[str, Any], **kwargs) -> List[Dict[str, Any]]: params = types.SimpleNamespace(**kwargs) opportunity_records = data['opportunity_info'].records @@ -303,13 +321,13 @@ def evaluate_template_update_opportunity_stage_and_activity(self, data: Dict[str if params.activity_type == 'Task' or params.activity_type == 'Email': activity_type_correct = activity_exists and activity_records[0]['TaskSubtype'] == params.activity_type - activity_description_correct = activity_exists and str(activity_records[0]['Subject']) == params.activity_description + activity_description_correct = activity_exists and self.__fuzzy_match(activity_records[0]['Subject'], params.activity_description) elif params.activity_type == 'Call': activity_type_correct = activity_exists and activity_records[0]['TaskSubtype'] == params.activity_type - activity_description_correct = activity_exists and str(activity_records[0]['Description']).lower() == params.activity_description.lower() + activity_description_correct = activity_exists and self.__fuzzy_match(str(activity_records[0]['Description']).lower(), params.activity_description.lower()) elif params.activity_type == 'Event': activity_type_correct = event_exists and event_records[0]['EventSubtype'] == params.activity_type - activity_description_correct = event_exists and str(event_records[0]['Subject']) == params.activity_description + activity_description_correct = event_exists and self.__fuzzy_match(str(event_records[0]['Subject']), params.activity_description) else: activity_type_correct = False activity_description_correct = False From a396dcca7973854e3bb01263dd0a2bf58ff5813f Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Thu, 6 Nov 2025 21:28:11 -0500 Subject: [PATCH 09/21] Move from print statements to logging, instantiate evaluators only once --- main_bu.py | 11 +++-- scuba/helpers/salesforce_commands.py | 67 ++++++++++++++++------------ scuba/helpers/utils.py | 12 +++-- scuba/phases/prerequisites.py | 18 +++++--- scuba/phases/resetter.py | 19 ++++---- utils.py | 30 +++++++++++-- 6 files changed, 102 insertions(+), 55 deletions(-) diff --git a/main_bu.py b/main_bu.py index 7c07de9..92ae5bd 100644 --- a/main_bu.py +++ b/main_bu.py @@ -17,11 +17,10 @@ import glob from playwright.async_api import async_playwright -from utils import run_evaluate, run_reset, LogFormatter, split_task_config_pool_into_batches +from utils import run_evaluate, run_reset, LogFormatter, split_task_config_pool_into_batches, capture_logs_to_file from args import get_args from scuba.phases.evaluation.master_evaluator import MilestoneEvaluator -from scuba.phases.resetter import Resetter from scuba.helpers.salesforce_commands import authorize_using_access_token, install_initial_data, retrieve_initial_state_metadata, create_project_if_not_exists # build env and agent from browser_use import Controller @@ -203,7 +202,7 @@ async def replan(param_model:NoParamsAction, browser: BrowserContextBugFix) -> A async with asyncio.Lock(): try: - evaluator = MilestoneEvaluator(args.org_alias) + # breakpoint() score_card = evaluator.evaluate_instance(task_instance_dict, agent_answer) evaluation_result = score_card.__dict__() @@ -343,7 +342,10 @@ async def test(args: argparse.Namespace, task_config_pool: List[Dict]) -> None: # Since the reset and evaluation are based on local files; we need to reset the salesforce orgs first logger.info(f"Bulk resetting the salesforce orgs...") time_start = time.perf_counter() - run_reset(task_config_pool, args.org_alias) + file = os.path.join(args.result_dir, "reset.log") + with capture_logs_to_file(file): + run_reset(task_config_pool, args.org_alias) + time_end = time.perf_counter() logger.info(f"Done bulk resetting the salesforce orgs in {time_end - time_start:.2f} seconds") if args.solutions == 'bu': @@ -410,6 +412,7 @@ def get_unfinished_task_ids(task_instance_dicts: List[Dict], target_dir: str): if __name__ == '__main__': args = get_args() assert args.org_alias == os.getenv("ORG_ALIAS"), f"org_alias: {args.org_alias} is not the same as the org_alias in the .env file: {os.getenv('ORG_ALIAS')}. The one in the .env file is used to login in the remote desktop environment." + evaluator=MilestoneEvaluator(args.org_alias) args.result_dir = os.path.join(args.result_dir, args.run_name) assert args.total_desired_envs == args.max_concurrent_tasks, f"total_desired_envs: {args.total_desired_envs} is not the same as max_concurrent_tasks: {args.max_concurrent_tasks}" diff --git a/scuba/helpers/salesforce_commands.py b/scuba/helpers/salesforce_commands.py index 7c58ba8..8b22138 100644 --- a/scuba/helpers/salesforce_commands.py +++ b/scuba/helpers/salesforce_commands.py @@ -4,10 +4,15 @@ import pandas as pd import json import time +import logging import shutil from tqdm import tqdm import subprocess from scuba.helpers.utils import get_org_info + +logger = logging.getLogger(__name__) +logger.propagate = True + GREEN = "\033[92m" RED = "\033[91m" YELLOW = "\033[93m" @@ -35,7 +40,7 @@ def get_access_token(org_alias: str): access_token = response.json()['access_token'] return access_token except Exception as exc: - print(f'Authorization failed with exception: {exc}') + logger.info(f'Authorization failed with exception: {exc}') def get(org_alias: str, endpoint: str, access_token: str=None, instance: str=None): if access_token is None: @@ -65,14 +70,20 @@ def post(org_alias:str, endpoint: str, data: dict, access_token: str=None, insta url = instance + endpoint response = requests.post(url, headers=headers, json=data) - print(response.json()) + if response.status_code < 400: + return True, None + else: + return False, response.json() def patch(org_alias:str, endpoint: str, data: dict): headers = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + get_access_token(org_alias)} instance = get_org_info(org_alias)['instance'] url = instance + endpoint response = requests.patch(url, headers=headers, json=data) - print(response.text) + if response.status_code<400: + return True, None + else: + return False, response.json() def authorize_using_access_token(org_alias: str): @@ -84,7 +95,7 @@ def authorize_using_access_token(org_alias: str): stdout, stderr = execute_sfdx_command(login_command, env=env) if stderr != '': raise RuntimeError(f'{RED}Login failed with: {stderr}{RESET}') - print(f'{GREEN}Login successful for the org: {org_alias}{RESET}') + logger.info(f'{GREEN}Login successful for the org: {org_alias}{RESET}') def execute_sfdx_command(command: str, cwd: str=None, env=None): @@ -97,7 +108,7 @@ def execute_sfdx_command(command: str, cwd: str=None, env=None): Returns: str: The output of the command. """ - print(f"Executing command: {command}") + logger.info(f"Executing command: {command}") if env is None: env = os.environ.copy() env.update({ @@ -107,9 +118,9 @@ def execute_sfdx_command(command: str, cwd: str=None, env=None): stdout = output.stdout stderr = output.stderr # if stderr != '': - # print(f'{RED}Command failed with: {stderr}{RESET}') - # print(f'\t\tOutput: {output.stdout}') - # print(f'\t\tError: {output.stderr}') + # logger.info(f'{RED}Command failed with: {stderr}{RESET}') + # logger.info(f'\t\tOutput: {output.stdout}') + # logger.info(f'\t\tError: {output.stderr}') return output.stdout, output.stderr @@ -122,11 +133,11 @@ def create_project_if_not_exists(folder_path: str, org_alias: str): org_alias (str): The alias of the organization to retrieve metadata from. """ if not os.path.exists(folder_path): - print(f"Creating project for {org_alias} into {folder_path}.") + logger.info(f"Creating project for {org_alias} into {folder_path}.") project_name = os.path.basename(folder_path) generate_project_command = f"sf project generate --name {project_name} --output-dir {os.path.dirname(folder_path)}" execute_sfdx_command(generate_project_command) - print(f"Project generation complete for {org_alias}.") + logger.info(f"Project generation complete for {org_alias}.") os.makedirs(os.path.join(folder_path, "manifest"), exist_ok=True) def retrieve_initial_state_metadata(org_alias: str): @@ -138,9 +149,9 @@ def retrieve_initial_state_metadata(org_alias: str): generate_manifest_command = f"sf project generate manifest --from-org {username} --output-dir manifest" execute_sfdx_command(generate_manifest_command, cwd=folder_path) retrieve_latest_metadata(folder_path, org_alias) - print(f"{GREEN}Retrieved initial state metadata for {org_alias} into {folder_path}.{RESET}") + logger.info(f"{GREEN}Retrieved initial state metadata for {org_alias} into {folder_path}.{RESET}") else: - print(f"{YELLOW}Initial state metadata for {org_alias} already exists in {folder_path}.{RESET}") + logger.info(f"{YELLOW}Initial state metadata for {org_alias} already exists in {folder_path}.{RESET}") def retrieve_latest_metadata(folder_path: str, org_alias: str): @@ -151,16 +162,16 @@ def retrieve_latest_metadata(folder_path: str, org_alias: str): folder_path (str): The path to the folder containing the metadata. org_alias (str): The alias of the organization to retrieve metadata from. """ - print(f"Retrieving metadata for {org_alias} into {folder_path}.") + logger.info(f"Retrieving metadata for {org_alias} into {folder_path}.") username = get_org_info(org_alias)['username'] start_time = time.time() retrieve_command = f"sf project retrieve start --manifest manifest/package.xml -o {username}" execute_sfdx_command(retrieve_command, cwd=folder_path) end_time = time.time() - print(f"Retrieved latest metadata for {org_alias} to {folder_path} in {end_time - start_time} seconds.") + logger.info(f"Retrieved latest metadata for {org_alias} to {folder_path} in {end_time - start_time} seconds.") def deploy(folder_path: str, org_alias: str): - print(f"Deploying changes for {org_alias} from {folder_path}.") + logger.info(f"Deploying changes for {org_alias} from {folder_path}.") username = get_org_info(org_alias)['username'] start_time = time.time() deploy_command = f"sf project deploy start --manifest manifest/package.xml --post-destructive-changes manifest/destructiveChanges.xml --ignore-errors -o {username}" @@ -168,10 +179,10 @@ def deploy(folder_path: str, org_alias: str): if 'Component Failures' in output: raise DeployError(output) end_time = time.time() - print(f"Deployed changes to {org_alias} in {end_time - start_time} seconds.") + logger.info(f"Deployed changes to {org_alias} in {end_time - start_time} seconds.") def run_query(query: str, nickname: str, org_alias: str): - print(f"Running query: {query}") + logger.info(f"Running query: {query}") username = get_org_info(org_alias)['username'] start_time = time.time() query_command = f"sf data query --query \"{query}\" --output-file {nickname}.csv --result-format csv -o {username}" @@ -180,37 +191,37 @@ def run_query(query: str, nickname: str, org_alias: str): if errors != 'Querying Data... done\n': raise RuntimeError(f'Query failed with {errors}') end_time = time.time() - print(f"Query results saved in {nickname}.csv in {end_time - start_time} seconds.") + logger.info(f"Query results saved in {nickname}.csv in {end_time - start_time} seconds.") def run_query_json(query: str, org_alias: str): - print(f"Running query: {query}") + logger.info(f"Running query: {query}") username = get_org_info(org_alias)['username'] start_time = time.time() query_command = f"sf data query --query \"{query}\" --json -o {username}" stdout, stderr = execute_sfdx_command(query_command) end_time = time.time() - print(f"Query executed in {end_time - start_time} seconds.") + logger.info(f"Query executed in {end_time - start_time} seconds.") return json.loads(stdout) def update_record(object: str, record_id: str, key: str, value: str, org_alias: str): - print(f"Updating record: {object}:{record_id}:{key}:{value}") + logger.info(f"Updating record: {object}:{record_id}:{key}:{value}") username = get_org_info(org_alias)['username'] start_time = time.time() command = f"sf data update record --sobject {object} --record-id {record_id} --key {key} --values \"{key}={value}\" -o {username}" execute_sfdx_command(command) end_time = time.time() - print(f"Updated record in {end_time-start_time} seconds.") + logger.info(f"Updated record in {end_time-start_time} seconds.") def download_initial_csv(org_alias, object, destination_filename): - print(f"Downloading initial CSV for {object}.") + logger.info(f"Downloading initial CSV for {object}.") query = f'SELECT FIELDS(ALL) FROM {object} LIMIT 200' run_query(query, object, org_alias) if os.path.exists(f'{object}.csv'): shutil.move(f'{object}.csv', destination_filename) def install_initial_data(org_alias, instances): - print(f"Downloading initial data (if not already found) for {org_alias}.") + logger.info(f"Downloading initial data (if not already found) for {org_alias}.") all_objects = set() initial_data_directory = os.path.join('initial_data', org_alias) os.makedirs(initial_data_directory, exist_ok=True) @@ -223,18 +234,18 @@ def install_initial_data(org_alias, instances): json_filepath = os.path.join(initial_data_directory, f'{object}.json') if not os.path.exists(json_filepath): endpoint = f"/services/data/v64.0/sobjects/{object}/describe/" - print(f"Pulling object description for {object}") + logger.info(f"Pulling object description for {object}") object_description = get(org_alias=org_alias, endpoint=endpoint) json.dump(object_description, open(json_filepath, 'w')) destination_filename = os.path.join(initial_data_directory, f'{object}.csv') if not os.path.exists(destination_filename): download_initial_csv(org_alias, object, destination_filename) - print(f"{GREEN}Downloading initial data complete.{RESET}") + logger.info(f"{GREEN}Downloading initial data complete.{RESET}") def does_data_exist(object: str, unique_keys_and_vals: dict, org_alias: str): query_pairs = [f"{key}='{value}'" for key, value in unique_keys_and_vals.items()] query_string = "AND ".join(query_pairs) - print(f"Checking data exists: {object}") + logger.info(f"Checking data exists: {object}") query = f"SELECT FIELDS(ALL) FROM {object} WHERE {query_string} LIMIT 5" nickname = f"check_{object}_exists_{random.randint(100,900)}" run_query(query, nickname, org_alias) @@ -259,4 +270,4 @@ def does_data_exist(object: str, unique_keys_and_vals: dict, org_alias: str): try: deploy(f'orgs/modified_state/{org_alias}', org_alias) except DeployError as exc: - print(f'Traceback: {traceback.format_exc()}') + logger.info(f'Traceback: {traceback.format_exc()}') diff --git a/scuba/helpers/utils.py b/scuba/helpers/utils.py index 618ed59..ba8d44f 100644 --- a/scuba/helpers/utils.py +++ b/scuba/helpers/utils.py @@ -1,11 +1,15 @@ import json import os +import logging import string import re import xmltodict from jsondiff import diff from dict2xml import dict2xml +logger = logging.getLogger(__name__) +logger.propagate = True + # Org Details utils orgs_info = json.load(open("orgs/orgs_info.json")) def get_org_info(org_alias: str): @@ -33,7 +37,7 @@ def create_metadata_info_xml(types_and_members: dict, manifest_folder: str, is_d filename = 'destructiveChanges.xml' else: filename = 'package.xml' - print(f'Writing {types_and_members} to {filename}.') + logger.info(f'Writing {types_and_members} to {filename}.') with open(os.path.join(manifest_folder, filename), 'w') as f: f.write(xml_package) @@ -116,7 +120,7 @@ def diff_xml(file1, file2): def compare_folders(folder_a, folder_b): - print(f"Comparing {folder_a} and {folder_b}.") + logger.info(f"Comparing {folder_a} and {folder_b}.") files_a = get_all_files(folder_a) files_b = get_all_files(folder_b) deleted_files = [] @@ -137,7 +141,7 @@ def compare_folders(folder_a, folder_b): if xml_diffs: modified_files.append(rel_path) except Exception as e: - print(f'Exception while comparing {rel_path}: {e}') - print(f"Found {len(new_files)} new files, {len(deleted_files)} files deleted, {len(modified_files)} files modified\n.") + logger.info(f'Exception while comparing {rel_path}: {e}') + logger.info(f"Found {len(new_files)} new files, {len(deleted_files)} files deleted, {len(modified_files)} files modified\n.") return new_files, deleted_files, modified_files diff --git a/scuba/phases/prerequisites.py b/scuba/phases/prerequisites.py index f656910..d272574 100644 --- a/scuba/phases/prerequisites.py +++ b/scuba/phases/prerequisites.py @@ -2,8 +2,9 @@ This file handles the prerequisites phase of the CRM benchmark pipeline. It contains functions to check and install prerequisites required for each benchmark scenario. """ +import traceback from datetime import datetime,timedelta - +import logging import pandas as pd import glob import random @@ -17,6 +18,8 @@ from scuba.helpers.utils import convert_type_to_folder_name, create_metadata_info_xml from scuba.helpers.salesforce_commands import deploy, post, patch, does_data_exist, authorize_using_access_token, create_project_if_not_exists, run_query +logger = logging.getLogger(__name__) +logger.propagate = True PREREQUISITES_FOLDER = 'scuba/prerequisites' object_unique_keys_map = { @@ -66,7 +69,7 @@ def __install_prerequisite_metadata(self): files = glob.glob(pattern_to_search, recursive=False) destination_dir = os.path.join(self.modified_metadata_details_dir, folder_name_for_type) if len(files) != 1 and member !='*': - print(f'There should be exactly one file matching {pattern_to_search}') + logger.info(f'There should be exactly one file matching {pattern_to_search}') else: if type != 'CustomField': os.makedirs(destination_dir, exist_ok=True) @@ -83,7 +86,10 @@ def __install_prerequisite_metadata(self): if package_changes_types_and_members: create_metadata_info_xml(package_changes_types_and_members, self.manifest_dir, is_destructive=False) create_metadata_info_xml({}, self.manifest_dir, is_destructive=True) - deploy(self.modified_orgs_dir, self.org_alias) + try: + deploy(self.modified_orgs_dir, self.org_alias) + except Exception as e: + logger.error(traceback.format_exc()) def __get_id_for_dependency(self, object_name, field, value_name): soql = f"SELECT Id FROM {object_name} WHERE {field} = {value_name}" @@ -110,10 +116,10 @@ def __post_record(self, object_name, record): data_exists, id = self.__check_prerequisities_in_existing_data(object_name, record) info = {key: record[key] for key in object_unique_keys_map.get(object_name)} if data_exists: - print(f'{object_name} {info} already exists in org {self.org_alias} with ID: {id}. Patching...') + logger.info(f'{object_name} {info} already exists in org {self.org_alias} with ID: {id}. Patching...') patch(self.org_alias, f'/services/data/v62.0/sobjects/{object_name}/{id}', record) else: - print(f'{object_name} {info} does not exist in org {self.org_alias}. Creating...') + logger.info(f'{object_name} {info} does not exist in org {self.org_alias}. Creating...') post(self.org_alias, f'/services/data/v62.0/sobjects/{object_name}', record) def __create_records(self, object_name, records): @@ -159,7 +165,7 @@ def __create_single_coupled_record(self, coupled_record, dependent_field): self.__post_record(object_name1, to_post) exists, id = self.__get_id_for_dependency(object_name1, 'Name', f'\'{coupled_record[object_name1]["Name"]}\'') if not exists: - print(f'Posting {object_name1}: {coupled_record[object_name1]["Name"]} failed.') + logger.info(f'Posting {object_name1}: {coupled_record[object_name1]["Name"]} failed.') return to_post = coupled_record[object_name2].copy() if type(to_post) == dict: diff --git a/scuba/phases/resetter.py b/scuba/phases/resetter.py index ab28ad7..dd8d86d 100644 --- a/scuba/phases/resetter.py +++ b/scuba/phases/resetter.py @@ -5,6 +5,7 @@ import os import json +import logging import shutil import traceback import threading @@ -19,7 +20,7 @@ from scuba.helpers.salesforce_commands import get, retrieve_latest_metadata, deploy, run_query, \ execute_sfdx_command, authorize_using_access_token, patch, delete, DeployError from scuba.phases.prerequisites import Prerequisites - +logger = logging.getLogger(__name__) class Resetter(BasePhase): def __init__(self, org_alias, metadata_types, objects, prerequisites): @@ -90,7 +91,7 @@ def __reset_data(self): try: run_query(query, object, self.org_alias) except Exception as e: - print(f'Querying object {object} failed with error: {traceback.format_exc()}') + logger.info(f'Querying object {object} failed with error: {traceback.format_exc()}') for o in self.objects: initial_data_directory = os.path.join('initial_data', self.org_alias) old_data_file = os.path.join(initial_data_directory, f'{o}.csv') @@ -104,7 +105,7 @@ def __reset_data(self): try: new_df = pd.read_csv(f'{o}.csv') except (EmptyDataError, FileNotFoundError) as e: - print(f'No data found for {o} object.') + logger.info(f'No data found for {o} object.') if os.path.exists(f'{o}.csv'): os.remove(f'{o}.csv') continue @@ -113,7 +114,7 @@ def __reset_data(self): new_ids = set(new_df['Id'].values.tolist()).difference(set(old_data['Id'].values.tolist())) else: new_ids = set(new_df['Id'].values.tolist()) - print(f'Found {len(new_ids)} new IDs in {o} object.') + logger.info(f'Found {len(new_ids)} new IDs in {o} object.') if len(new_ids) > 0: if o == 'UserLogin': threads = [] @@ -163,7 +164,7 @@ def __deploy_diff(self): for type in self.metadata_types: if type in ['ListView', 'MatchingRule']: - query = f'SELECT SObjectType, DeveloperName FROM {type} WHERE SystemModstamp >= LAST_N_DAYS:20' + query = f'SELECT SObjectType, DeveloperName FROM {type} WHERE SystemModstamp >= LAST_N_DAYS:20 AND LastModifiedBy.Username=\'{os.environ["SALESFORCE_USERNAME"]}\'' run_query(query, type, self.org_alias) try: df = pd.read_csv(f'{type}.csv') @@ -178,7 +179,7 @@ def __deploy_diff(self): try: deploy(self.modified_orgs_dir, self.org_alias) except DeployError as exc: - print(f'Failed to deploy {type}. Traceback: {traceback.format_exc()}') + logger.info(f'Failed to deploy {type}. Traceback: {traceback.format_exc()}') elif type == 'ValidationRule': self.__reset_validation_rule() elif type in ['Report']: @@ -212,7 +213,7 @@ def __deploy_diff(self): try: deploy(self.modified_orgs_dir, self.org_alias) except DeployError as exc: - print(f'Failed to deploy {type}. Traceback: {traceback.format_exc()}') + logger.info(f'Failed to deploy {type}. Traceback: {traceback.format_exc()}') destructive_changes_types_and_members.setdefault(type, []) destructive_changes_types_and_members[type].append(member_name) create_metadata_info_xml(destructive_changes_types_and_members, self.manifest_dir, is_destructive=True) @@ -220,7 +221,7 @@ def __deploy_diff(self): try: deploy(f'orgs/modified_state/{self.org_alias}', self.org_alias) except DeployError as exc: - print(f'Failed to deploy {type}. Traceback: {traceback.format_exc()}') + logger.info(f'Failed to deploy {type}. Traceback: {traceback.format_exc()}') to_remove = os.path.join(self.modified_metadata_details_dir, folder_name_for_type, file) if os.path.isfile(to_remove): os.remove(to_remove) @@ -240,7 +241,7 @@ def __deploy_diff(self): try: deploy(f'orgs/modified_state/{self.org_alias}', self.org_alias) except DeployError as exc: - print(f'Failed to deploy {type}. Traceback: {traceback.format_exc()}') + logger.info(f'Failed to deploy {type}. Traceback: {traceback.format_exc()}') if __name__ == '__main__': resetter = Resetter(org_alias='YDCRMGUI', metadata_types=["ValidationRule"], objects=[], prerequisites={}) diff --git a/utils.py b/utils.py index 2830300..b682ae3 100644 --- a/utils.py +++ b/utils.py @@ -5,7 +5,6 @@ import os from pathlib import Path from scuba.phases.evaluation.master_evaluator import MilestoneEvaluator -from scuba.phases.evaluation.master_evaluator import MilestoneEvaluator from scuba.phases.resetter import Resetter import traceback import time @@ -14,6 +13,29 @@ logger = logging.getLogger(__name__) +from contextlib import contextmanager + + +@contextmanager +def capture_logs_to_file(filename,level=logging.INFO): + logger=logging.getLogger() # root logger + logger.setLevel(level) + + # Create file handler + file_handler=logging.FileHandler(filename,mode='w') + formatter=logging.Formatter('%(asctime)s - %(levelname)s - %(message)s') + file_handler.setFormatter(formatter) + + # Add handler + logger.addHandler(file_handler) + + try: + yield + finally: + # Remove handler after method finishes + logger.removeHandler(file_handler) + file_handler.close() + class LogFormatter(logging.Formatter): def format(self, record): if type(record.name) == str and record.name.startswith('scuba.'): @@ -126,7 +148,7 @@ def split_task_config_pool_into_batches(task_config_pool: List[Dict], args: argp def run_reset(batch, org_alias): - print(f'Resetting org {org_alias}') + logger.info(f'Resetting org {org_alias}') start = time.time() metadata_types = set() prerequisite_objects = [] @@ -144,7 +166,7 @@ def run_reset(batch, org_alias): for type, members in task_prerequisite_types_and_members.items(): prerequisite_types_and_members.setdefault(type, []) prerequisite_types_and_members[type].extend(members) - prerequisite_objects = list(set(prerequisite_objects)) + prerequisite_objects = list(dict.fromkeys(prerequisite_objects)) for key, values in prerequisite_types_and_members.items(): prerequisite_types_and_members[key] = list(set(values)) @@ -152,7 +174,7 @@ def run_reset(batch, org_alias): "metadata": {"types_and_members": prerequisite_types_and_members}} resetter = Resetter(org_alias, list(metadata_types), list(objects), prerequisites) resetter.reset() - print(f'Resetting and prerequisites installation completed in {time.time() - start} seconds.') + logger.info(f'Resetting and prerequisites installation completed in {time.time() - start} seconds.') def run_evaluate(task_instance_dict: dict, agent_answer: str, org_alias: str): try: From b7871e8dffdf48d4436f0b6d86dca917ee82c8aa Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Thu, 6 Nov 2025 22:34:10 -0500 Subject: [PATCH 10/21] Mitigate sales_001 issue --- .../evaluation_data_retrieval_workflows_sales.json | 2 +- scuba/phases/evaluation/milestone_evaluator_sales.py | 12 +++++++++--- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/scuba/phases/evaluation/evaluation_data_retrieval_workflows_sales.json b/scuba/phases/evaluation/evaluation_data_retrieval_workflows_sales.json index 28f42df..8198b3d 100644 --- a/scuba/phases/evaluation/evaluation_data_retrieval_workflows_sales.json +++ b/scuba/phases/evaluation/evaluation_data_retrieval_workflows_sales.json @@ -11,7 +11,7 @@ "type": "soql", "id": "get_contact", "params": { - "query": "SELECT Id, AccountId, Name, Email, Phone, MobilePhone, HomePhone FROM Contact WHERE AccountId = '{retrieved_results['get_company_account'].records[0]['Id']}'" + "query": "SELECT Id, Account.Name, Name, Email, Phone, MobilePhone, HomePhone FROM Contact WHERE Name = '{contact_name}'" } } ], diff --git a/scuba/phases/evaluation/milestone_evaluator_sales.py b/scuba/phases/evaluation/milestone_evaluator_sales.py index d0f516d..c842b98 100644 --- a/scuba/phases/evaluation/milestone_evaluator_sales.py +++ b/scuba/phases/evaluation/milestone_evaluator_sales.py @@ -26,6 +26,7 @@ def evaluate_template_create_account_and_contact(self, data: Dict[str, Any], **k contact_data = data.get('get_contact') contact_exists = contact_data is not None and len(contact_data.records) > 0 + contact_linked_to_account = contact_exists and contact_data.records[0]['Account.Name'] == params.company_name correct_email = contact_exists and contact_data.records[0]['Email'] == params.email_address if params.email_address else True correct_name = contact_exists and contact_data.records[0]['Name'] == params.contact_name correct_phone = contact_exists and (contact_data.records[0]['Phone'] == params.phone_number or contact_data.records[0]['MobilePhone'] == params.phone_number or contact_data.records[0]['HomePhone'] == params.phone_number) if params.phone_number else True @@ -34,11 +35,16 @@ def evaluate_template_create_account_and_contact(self, data: Dict[str, Any], **k { "milestone": f"Create Account for {params.company_name}", "is_success": account_exists_with_correct_name, - "weight": 0.2 + "weight": 0.1 + }, + { + 'milestone': f"Create Contact {params.contact_name}", + 'is_success': contact_exists, + 'weight': 0.1 }, { - "milestone": f"Create contact for the same Account", - "is_success": contact_exists, + "milestone": f"Link contact to the created Account", + "is_success": contact_linked_to_account, "weight": step_weight }, { From 2527821485cb59a106eaa5488eecedc295099769 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Thu, 6 Nov 2025 23:45:18 -0500 Subject: [PATCH 11/21] Fix GT issue in admin_003_002 --- data/test_demo_aug.json | 2 +- data/test_zero_shot.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/data/test_demo_aug.json b/data/test_demo_aug.json index c01f455..4fc3411 100644 --- a/data/test_demo_aug.json +++ b/data/test_demo_aug.json @@ -538,7 +538,7 @@ "operator2": "equals", "value2": "Closed Lost", "field3": "AMOUNT", - "operator3": "greaterThan", + "operator3": "greaterOrEqual", "value3": "\"5,000\"", "field4": "TYPE", "operator4": "equals", diff --git a/data/test_zero_shot.json b/data/test_zero_shot.json index 458fb11..fb0a298 100644 --- a/data/test_zero_shot.json +++ b/data/test_zero_shot.json @@ -527,7 +527,7 @@ "operator2": "equals", "value2": "Closed Lost", "field3": "AMOUNT", - "operator3": "greaterThan", + "operator3": "greaterOrEqual", "value3": "\"5,000\"", "field4": "TYPE", "operator4": "equals", From a42e7b5f2092fac5afbac3e81d37500b58898ed6 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Mon, 10 Nov 2025 13:34:32 -0500 Subject: [PATCH 12/21] Reset every batch in main_bu --- main_bu.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/main_bu.py b/main_bu.py index 92ae5bd..473bb1a 100644 --- a/main_bu.py +++ b/main_bu.py @@ -338,16 +338,7 @@ async def test(args: argparse.Namespace, task_config_pool: List[Dict]) -> None: total_batches = len(task_config_pool_batches) logger.info(f"Split the task_config_pool into {total_batches} batches due to constraints and dependencies of different tasks") - if args.reset_orgs_before_eval: - # Since the reset and evaluation are based on local files; we need to reset the salesforce orgs first - logger.info(f"Bulk resetting the salesforce orgs...") - time_start = time.perf_counter() - file = os.path.join(args.result_dir, "reset.log") - with capture_logs_to_file(file): - run_reset(task_config_pool, args.org_alias) - time_end = time.perf_counter() - logger.info(f"Done bulk resetting the salesforce orgs in {time_end - time_start:.2f} seconds") if args.solutions == 'bu': # build auxilary components retriever = None @@ -376,6 +367,15 @@ async def test(args: argparse.Namespace, task_config_pool: List[Dict]) -> None: for batch_idx, task_config_pool in enumerate(task_config_pool_batches): num_tasks = len(task_config_pool) logger.info(f"Starting batch {batch_idx} with {num_tasks} tasks") + # Since the reset and evaluation are based on local files; we need to reset the salesforce orgs first + logger.info(f"Bulk resetting the salesforce orgs...") + time_start=time.perf_counter() + file=os.path.join(args.result_dir,"reset.log") + with capture_logs_to_file(file): + run_reset(task_config_pool, args.org_alias) + + time_end=time.perf_counter() + logger.info(f"Done bulk resetting the salesforce orgs in {time_end-time_start:.2f} seconds") semaphore = asyncio.Semaphore(args.max_concurrent_tasks) job_queue = [] for task_instance_dict in task_config_pool: From 38260a376dec94c6abce2420045ad2a5e791e342 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Mon, 10 Nov 2025 13:35:28 -0500 Subject: [PATCH 13/21] Install prerequisites one by one --- scuba/phases/prerequisites.py | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/scuba/phases/prerequisites.py b/scuba/phases/prerequisites.py index d272574..1ba749a 100644 --- a/scuba/phases/prerequisites.py +++ b/scuba/phases/prerequisites.py @@ -58,11 +58,12 @@ def install_prerequisites(self): self.__install_prerequisite_data() def __install_prerequisite_metadata(self): - package_changes_types_and_members = {} + self.types_and_members.update({ 'Settings': ['Knowledge', 'ServiceSetupAssistant', 'Quote', 'Entitlement'] }) for type, members in self.types_and_members.items(): + package_changes_types_and_members={} folder_name_for_type = convert_type_to_folder_name(type) for member in members: pattern_to_search = f"{PREREQUISITES_FOLDER}/metadata/{folder_name_for_type}/{member}*" @@ -83,13 +84,13 @@ def __install_prerequisite_metadata(self): shutil.copytree(files[0], os.path.join(str(destination_dir), os.path.basename(files[0])), dirs_exist_ok=True) package_changes_types_and_members.setdefault(type, []) package_changes_types_and_members[type].append(member) - if package_changes_types_and_members: - create_metadata_info_xml(package_changes_types_and_members, self.manifest_dir, is_destructive=False) - create_metadata_info_xml({}, self.manifest_dir, is_destructive=True) - try: - deploy(self.modified_orgs_dir, self.org_alias) - except Exception as e: - logger.error(traceback.format_exc()) + if package_changes_types_and_members: + create_metadata_info_xml(package_changes_types_and_members, self.manifest_dir, is_destructive=False) + create_metadata_info_xml({}, self.manifest_dir, is_destructive=True) + try: + deploy(self.modified_orgs_dir, self.org_alias) + except Exception as e: + logger.error(traceback.format_exc()) def __get_id_for_dependency(self, object_name, field, value_name): soql = f"SELECT Id FROM {object_name} WHERE {field} = {value_name}" @@ -117,10 +118,14 @@ def __post_record(self, object_name, record): info = {key: record[key] for key in object_unique_keys_map.get(object_name)} if data_exists: logger.info(f'{object_name} {info} already exists in org {self.org_alias} with ID: {id}. Patching...') - patch(self.org_alias, f'/services/data/v62.0/sobjects/{object_name}/{id}', record) + status, details = patch(self.org_alias, f'/services/data/v62.0/sobjects/{object_name}/{id}', record) + if not status: + logger.error(f'Patching {object_name} {info} failed. Details: {details}') else: logger.info(f'{object_name} {info} does not exist in org {self.org_alias}. Creating...') - post(self.org_alias, f'/services/data/v62.0/sobjects/{object_name}', record) + status, details = post(self.org_alias, f'/services/data/v62.0/sobjects/{object_name}', record) + if not status: + logger.error(f'Prerequisite {object_name} {info} failed. Details: {details}') def __create_records(self, object_name, records): threads = [] @@ -135,7 +140,9 @@ def __create_records(self, object_name, records): def __generate_pricebook_records(self, products_filepath, pricebook_entry_filepath): data = json.load(open(products_filepath)) _, pricebook_id = self.__get_id_for_dependency('Pricebook2', 'IsStandard', 'true') - patch(self.org_alias, f'/services/data/v62.0/sobjects/Pricebook2/{pricebook_id}', {'IsActive': True}) + status, details = patch(self.org_alias, f'/services/data/v62.0/sobjects/Pricebook2/{pricebook_id}', {'IsActive': True}) + if not status: + logger.error(f'Failed to activate standard pricebook. Details: {details}') pricebook_entry_records = [] for record in data: product_name = record['Name'] @@ -200,8 +207,10 @@ def initial_setup(self): os.remove(f'{nickname}.csv') for id in user_ids: endpoint = f'/services/data/v62.0/sobjects/User/{id}' - patch(self.org_alias, endpoint, {'UserPermissionsMarketingUser': True, + status, details = patch(self.org_alias, endpoint, {'UserPermissionsMarketingUser': True, 'UserPermissionsKnowledgeUser': True}) + if not status: + logger.error(f'Failed to add marketing and knowledge permissions to user {id}.') def __install_prerequisite_data(self): objects = self.data_prerequisites.get('objects', []) From 5721c238039d834c5c3c12214516c90672279eec Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Mon, 10 Nov 2025 13:36:03 -0500 Subject: [PATCH 14/21] Log resetting errors --- scuba/phases/resetter.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scuba/phases/resetter.py b/scuba/phases/resetter.py index dd8d86d..84abd93 100644 --- a/scuba/phases/resetter.py +++ b/scuba/phases/resetter.py @@ -150,7 +150,9 @@ def __reset_data(self): id = record['Id'] del record['Id'] endpoint = f'/services/data/v62.0/sobjects/{o}/{id}' - patch(self.org_alias, endpoint, record) + status, details = patch(self.org_alias, endpoint, record) + if not status: + logger.error(f'Failed to update {o} object {id}. Details: {details}') if os.path.exists(f'{o}.csv'): os.remove(f'{o}.csv') From d4561a82c4bc503b199920bf91b4949e8dab0888 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Mon, 10 Nov 2025 15:14:10 -0500 Subject: [PATCH 15/21] Fix Product2 reset issue --- data/test_demo_aug.json | 10 ++++++++++ data/test_zero_shot.json | 10 ++++++++++ 2 files changed, 20 insertions(+) diff --git a/data/test_demo_aug.json b/data/test_demo_aug.json index 4fc3411..14c85ea 100644 --- a/data/test_demo_aug.json +++ b/data/test_demo_aug.json @@ -8386,6 +8386,8 @@ } }, "objects": [ + "Quote", + "Opportunity", "PricebookEntry", "Pricebook2" ], @@ -8429,6 +8431,8 @@ } }, "objects": [ + "Quote", + "Opportunity", "PricebookEntry", "Pricebook2" ], @@ -8472,6 +8476,8 @@ } }, "objects": [ + "Quote", + "Opportunity", "PricebookEntry", "Pricebook2" ], @@ -8515,6 +8521,8 @@ } }, "objects": [ + "Quote", + "Opportunity", "PricebookEntry", "Pricebook2" ], @@ -8558,6 +8566,8 @@ } }, "objects": [ + "Quote", + "Opportunity", "PricebookEntry", "Pricebook2" ], diff --git a/data/test_zero_shot.json b/data/test_zero_shot.json index fb0a298..ef75a3e 100644 --- a/data/test_zero_shot.json +++ b/data/test_zero_shot.json @@ -8185,6 +8185,8 @@ } }, "objects": [ + "Quote", + "Opportunity", "PricebookEntry", "Pricebook2" ], @@ -8227,6 +8229,8 @@ } }, "objects": [ + "Quote", + "Opportunity", "PricebookEntry", "Pricebook2" ], @@ -8269,6 +8273,8 @@ } }, "objects": [ + "Quote", + "Opportunity", "PricebookEntry", "Pricebook2" ], @@ -8311,6 +8317,8 @@ } }, "objects": [ + "Quote", + "Opportunity", "PricebookEntry", "Pricebook2" ], @@ -8353,6 +8361,8 @@ } }, "objects": [ + "Quote", + "Opportunity", "PricebookEntry", "Pricebook2" ], From cfd21289ec8e0859f29fb86887ca7245df425b88 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Mon, 10 Nov 2025 15:18:10 -0500 Subject: [PATCH 16/21] Show object delete failures --- scuba/phases/resetter.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/scuba/phases/resetter.py b/scuba/phases/resetter.py index 84abd93..25693f8 100644 --- a/scuba/phases/resetter.py +++ b/scuba/phases/resetter.py @@ -66,12 +66,17 @@ def __reset_validation_rule(self): url = raw_response.get('records')[0]['attributes']['url'] delete(self.org_alias, url) + def __execute_delete(self, command): + stdout, stderr = execute_sfdx_command(command) + if stderr: + logger.error(stderr) + def __bulk_delete(self, object_name, record_ids): username = get_org_info(self.org_alias)['username'] threads = [] for id in record_ids: delete_command=f'sf data delete record --sobject {object_name} --record-id {id} -o {username}' - thread=threading.Thread(target=execute_sfdx_command,args=(delete_command,)) + thread=threading.Thread(target=self.__execute_delete,args=(delete_command,)) threads.append(thread) thread.start() for thread in tqdm(threads,desc="Deleting records"): From 8445b7889f9fc23355379323b05e01ed9fe02a13 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Mon, 10 Nov 2025 15:49:19 -0500 Subject: [PATCH 17/21] Fix service_002 reset issue --- scuba/phases/resetter.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/scuba/phases/resetter.py b/scuba/phases/resetter.py index 25693f8..2bee109 100644 --- a/scuba/phases/resetter.py +++ b/scuba/phases/resetter.py @@ -189,6 +189,23 @@ def __deploy_diff(self): logger.info(f'Failed to deploy {type}. Traceback: {traceback.format_exc()}') elif type == 'ValidationRule': self.__reset_validation_rule() + elif type in ['AssignmentRules']: + query = f'SELECT Id, SObjectType, Name FROM AssignmentRule WHERE SystemModstamp >= LAST_N_DAYS:20 AND LastModifiedBy.Username=\'{os.environ["SALESFORCE_USERNAME"]}\'' + run_query(query, type, self.org_alias) + try: + df = pd.read_csv(f'{type}.csv') + df['member']=df['SobjectType']+'.'+df['Name'] + new_members = df['member'].values.tolist() + for member in new_members: + destructive_changes_types_and_members = {'AssignmentRule': [member]} + create_metadata_info_xml(destructive_changes_types_and_members, self.manifest_dir, is_destructive=True) + create_metadata_info_xml({}, self.manifest_dir, is_destructive=False) + try: + deploy(self.modified_orgs_dir, self.org_alias) + except DeployError as exc: + logger.info(f'Failed to deploy {type}. Traceback: {traceback.format_exc()}') + except (EmptyDataError, Exception) as exc: + continue elif type in ['Report']: query = f'SELECT Id FROM Report WHERE SystemModstamp >= LAST_N_DAYS:10' run_query(query, type, self.org_alias) From f56c996ffc7493c04a85cb6834b27f9a65b55314 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Mon, 10 Nov 2025 15:52:58 -0500 Subject: [PATCH 18/21] Fix UserRole reset issue for admin_035 --- scuba/phases/resetter.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/scuba/phases/resetter.py b/scuba/phases/resetter.py index 2bee109..4321182 100644 --- a/scuba/phases/resetter.py +++ b/scuba/phases/resetter.py @@ -86,6 +86,9 @@ def __reset_data(self): """ Resets the data in the Salesforce org by deleting records created after the last reset. """ + # Delete UserRole objects twice to remove dependencies + if 'UserRole' in self.objects: + self.objects.append('UserRole') for object in self.objects: if object == 'Queue': query = f'SELECT FIELDS(ALL) FROM Group WHERE Type = \'{object}\' AND SystemModstamp >= LAST_N_DAYS:30 LIMIT 200' From d02aed16f11b90e9a4595601c2d4e406352eb562 Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Mon, 10 Nov 2025 15:53:24 -0500 Subject: [PATCH 19/21] Fix dependency of Queue on Assignment rule for service_001 --- data/test_demo_aug.json | 10 +++++----- data/test_zero_shot.json | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/data/test_demo_aug.json b/data/test_demo_aug.json index 14c85ea..ee7d04e 100644 --- a/data/test_demo_aug.json +++ b/data/test_demo_aug.json @@ -10498,7 +10498,7 @@ "QueueSobject", "Queue" ], - "metadata_types": [] + "metadata_types": ["AssignmentRules"] }, "instance_dict": { "queue_name": "Shoe Case Support", @@ -10546,7 +10546,7 @@ "QueueSobject", "Queue" ], - "metadata_types": [] + "metadata_types": ["AssignmentRules"] }, "instance_dict": { "queue_name": "VIP Support", @@ -10594,7 +10594,7 @@ "QueueSobject", "Queue" ], - "metadata_types": [] + "metadata_types": ["AssignmentRules"] }, "instance_dict": { "queue_name": "Marketing Support", @@ -10641,7 +10641,7 @@ "QueueSobject", "Queue" ], - "metadata_types": [] + "metadata_types": ["AssignmentRules"] }, "instance_dict": { "queue_name": "tech support specialists", @@ -10689,7 +10689,7 @@ "QueueSobject", "Queue" ], - "metadata_types": [] + "metadata_types": ["AssignmentRules"] }, "instance_dict": { "queue_name": "enterprise client support", diff --git a/data/test_zero_shot.json b/data/test_zero_shot.json index ef75a3e..8abfe58 100644 --- a/data/test_zero_shot.json +++ b/data/test_zero_shot.json @@ -10253,7 +10253,7 @@ "QueueSobject", "Queue" ], - "metadata_types": [] + "metadata_types": ["AssignmentRules"] }, "instance_dict": { "queue_name": "Shoe Case Support", @@ -10300,7 +10300,7 @@ "QueueSobject", "Queue" ], - "metadata_types": [] + "metadata_types": ["AssignmentRules"] }, "instance_dict": { "queue_name": "VIP Support", @@ -10347,7 +10347,7 @@ "QueueSobject", "Queue" ], - "metadata_types": [] + "metadata_types": ["AssignmentRules"] }, "instance_dict": { "queue_name": "Marketing Support", @@ -10393,7 +10393,7 @@ "QueueSobject", "Queue" ], - "metadata_types": [] + "metadata_types": ["AssignmentRules"] }, "instance_dict": { "queue_name": "tech support specialists", @@ -10440,7 +10440,7 @@ "QueueSobject", "Queue" ], - "metadata_types": [] + "metadata_types": ["AssignmentRules"] }, "instance_dict": { "queue_name": "enterprise client support", From 0753164220a054775d51f134752e4f89b328d04a Mon Sep 17 00:00:00 2001 From: Krithika Ramakrishnan Date: Mon, 10 Nov 2025 16:23:30 -0500 Subject: [PATCH 20/21] Add logging to test files to print log to stdout --- tests/test_prerequisite.py | 2 ++ tests/test_resetter.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/tests/test_prerequisite.py b/tests/test_prerequisite.py index fe292c8..0bf794f 100644 --- a/tests/test_prerequisite.py +++ b/tests/test_prerequisite.py @@ -2,6 +2,8 @@ import json import os os.chdir('..') +import logging +logging.basicConfig(level=logging.INFO) from scuba.phases.prerequisites import Prerequisites from scuba.helpers.salesforce_commands import authorize_using_access_token diff --git a/tests/test_resetter.py b/tests/test_resetter.py index 3421d09..107df67 100644 --- a/tests/test_resetter.py +++ b/tests/test_resetter.py @@ -3,6 +3,10 @@ from tqdm import tqdm import os os.chdir('..') +from dotenv import load_dotenv +load_dotenv() +import logging +logging.basicConfig(level=logging.INFO) from scuba.phases.resetter import Resetter from scuba.helpers.salesforce_commands import authorize_using_access_token From 711feec1fe8d21c478c44a8319541ebf87447e92 Mon Sep 17 00:00:00 2001 From: Yutong Dai Date: Mon, 16 Feb 2026 22:26:01 +0000 Subject: [PATCH 21/21] update readme --- .../vendor/OSworld/desktop_env/server/README.md | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) diff --git a/envs/remote_docker_env/vendor/OSworld/desktop_env/server/README.md b/envs/remote_docker_env/vendor/OSworld/desktop_env/server/README.md index afc8a3b..acca233 100644 --- a/envs/remote_docker_env/vendor/OSworld/desktop_env/server/README.md +++ b/envs/remote_docker_env/vendor/OSworld/desktop_env/server/README.md @@ -369,18 +369,7 @@ sudo systemctl start osworld_server.service ### Accessibility Tree Support -To support the accessibility tree functionality, you'll need to install pyastpi2 in your Ubuntu environment. This package enables access to accessibility information and tree structures. - -Installation steps: - -```bash -# Update package list and ensure pip is installed -sudo apt-get update -sudo apt-get install python3-pip - -# Install pyastpi2 using pip -pip3 install pyastpi2 -``` +Due to Salesforce security requirements, we’re unable to provide the solution here. Please refer to the instructions in the original repository and proceed at your own discretion. ### Xorg Configuration