Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,9 @@ transformers
typing_extensions
validators
xlsxwriter
# Dependencies newly added for the LLM4Chem (LlaSMol) evaluation tasks
datasets
rdkit
rdchiral
rouge_score
scikit-learn
5 changes: 4 additions & 1 deletion vlmeval/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@
from .medqbench_mcq import MedqbenchMCQDataset
from .medqbench_caption import MedqbenchCaptionDataset
from .medqbench_paired_description import MedqbenchPairedDescriptionDataset
from .llm4chem import LLM4Chem


class ConcatDataset(ImageBaseDataset):
Expand All @@ -103,6 +104,7 @@ class ConcatDataset(ImageBaseDataset):
'ScreenSpot': ['ScreenSpot_Mobile', 'ScreenSpot_Desktop', 'ScreenSpot_Web'],
'ScreenSpot_v2': ['ScreenSpot_v2_Mobile', 'ScreenSpot_v2_Desktop', 'ScreenSpot_v2_Web'],
'M4Bench': ['State_Invariance', 'State_Comparison', 'Spatial_Perception', 'Instance_Comparison', 'Detailed_Difference'], # noqa: E501
'LLM4Chem' : ['forward_synthesis', 'retrosynthesis', 'molecule_captioning', 'molecule_generation', 'name_conversion-i2f', 'name_conversion-i2s', 'name_conversion-s2f', 'name_conversion-s2i', 'property_prediction-esol', 'property_prediction-lipo', 'property_prediction-bbbp', 'property_prediction-clintox', 'property_prediction-hiv', 'property_prediction-sider', 'retrosynthesis_uspto50k'],
}

def __init__(self, dataset):
Expand Down Expand Up @@ -228,7 +230,8 @@ def evaluate(self, eval_file, **judge_kwargs):
]

TEXT_DATASET = [
TextMCQDataset
TextMCQDataset,
LLM4Chem,
]

CUSTOM_DATASET = [
Expand Down
181 changes: 181 additions & 0 deletions vlmeval/dataset/llm4chem.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,181 @@
import warnings
import pandas as pd
import re
from abc import abstractmethod
from sklearn.metrics import matthews_corrcoef, accuracy_score, r2_score, roc_auc_score, precision_score, recall_score, mean_absolute_error
from ..smp import *
from .text_base import TextBaseDataset
from .utils.llm4chem.utils.metrics import calculate_smiles_metrics, calculate_formula_metrics, calculate_text_metrics, calculate_number_metrics, calculate_boolean_metrics
from .utils.llm4chem.config import TASKS, TASK_TAGS, TASKS_WITH_SEMICOLON_REPLACE

# TODO: Now only support Top1 evaluation for generation tasks, need to support TopK evaluation in the future

# TODO: Modify the root path to https://opencompass.openxlab.space/ ... as needed
# TODO: See tos://tos-bjml-ai4scilab/scievalkit_benchmark/LLM4Chem/
DATASET_ROOT_PATH = "LLM4Chem/"

def extract_answer_part(outputs, left_tag, right_tag, mode='tag'):
    """Extract the answer span between *left_tag* and *right_tag* from each output.

    Args:
        outputs: list of raw model output strings.
        left_tag / right_tag: delimiter strings (e.g. '<SMILES>' / '</SMILES>'),
            or both ``None`` to take the whole output as the answer.
        mode: 'tag' extracts between the tags; 'direct' strips special tokens
            and returns the whole text.

    Returns:
        list[str]: one extracted answer per input; '' when a tag pair is missing.
    """
    assert mode in ('tag', 'direct')
    assert isinstance(outputs, list)

    answers = []
    for text in outputs:
        if mode == 'direct' or (left_tag is None and right_tag is None):
            # Strip common special tokens left over from generation.
            text = text.replace('<unk>', '').replace('</s>', '').strip()
            answers.append(text.strip())
            continue

        left_tag_pos = text.find(left_tag)
        if left_tag_pos == -1:
            answers.append('')
            continue
        content_start = left_tag_pos + len(left_tag)
        # BUGFIX: search for the closing tag AFTER the opening tag. Searching
        # from position 0 meant a stray closing tag earlier in the text
        # produced an empty (or garbled) extraction.
        right_tag_pos = text.find(right_tag, content_start)
        if right_tag_pos == -1:
            answers.append('')
            continue
        answers.append(text[content_start:right_tag_pos].strip())
    return answers

def LLM4Chem_postprocess(text, task, *args, **kwargs):
    """Extract the final answer for *task* from a raw model output string.

    Removes any <think>...</think> reasoning, pulls the task's tagged answer
    span, and falls back to heuristic regex recovery when no tag pair exists.
    """
    # Drop chain-of-thought content before any answer extraction.
    text = re.sub(r'<think>.*?</think>', '', text, flags=re.DOTALL)

    tags = TASK_TAGS[task]
    pred = extract_answer_part([text], *tags, mode='tag')[0]
    # SMILES-producing tasks may separate parts with ';'; normalize to '.'.
    if task in TASKS_WITH_SEMICOLON_REPLACE:
        pred = pred.replace(';', '.')
    if pred != '':
        return pred

    # No matched tag pair: best-effort recovery keyed on the answer type.
    opening_tag = tags[0]

    if opening_tag == '<BOOLEAN>':
        # Last yes/true/no/false occurrence in the text, case-insensitive.
        hits = re.findall(r'\b(?:yes|true|no|false)\b', text, re.IGNORECASE)
        if not hits:
            return ''
        return 'Yes' if hits[-1].lower() in ('yes', 'true') else 'No'

    if opening_tag == '<NUMBER>':
        # SMILES spans contain digits; strip them before scanning for numbers.
        cleaned = re.sub(r'<SMILES>.*?</SMILES>', '', text, flags=re.DOTALL)
        hits = re.findall(r'-?\d*\.\d+|-?\d+', cleaned)
        return hits[-1] if hits else ''

    if opening_tag == '<MOLFORMULA>':
        # Last chemical-formula-looking token in the text, if any.
        hits = re.findall(r'[\[\(]?[A-Z][a-z]?\d*(?:\([A-Za-z0-9]+\)\d*)?[\]\)]?(?:[A-Z][a-z]?\d*|\([^\)]+\)\d*|\[[^\]]+\]\d*)*(?:[+-]{1,2})?(?:·\d*[A-Z][a-z]?\d*)*', text)
        return hits[-1] if hits else ''

    return pred

# WARNING: You should ensure the Internet is connected while running this function for the first time.
def download_nltk():
    """Ensure the NLTK 'wordnet' corpus is available, downloading it if needed.

    Inside a conda environment the corpus is stored under
    ``$CONDA_PREFIX/nltk_data`` (and that directory is added to the NLTK search
    path); otherwise the default user directory ``~/nltk_data`` is used.
    """
    import nltk
    conda_prefix = os.environ.get('CONDA_PREFIX', None)
    if conda_prefix is not None:
        nltk_data_dir = os.path.join(conda_prefix, 'nltk_data')
        nltk.data.path.append(nltk_data_dir)
    else:
        # BUGFIX: expand the tilde so the printed path is a real location.
        nltk_data_dir = os.path.expanduser('~/nltk_data')
    try:
        # Skip the network round-trip when the corpus is already installed.
        nltk.data.find('corpora/wordnet')
    except LookupError:
        nltk.download('wordnet', download_dir=nltk_data_dir, quiet=True)
    print(f"NLTK 'wordnet' downloaded to: {nltk_data_dir}")

class LLM4Chem(TextBaseDataset):
    """LLM4Chem (LlaSMol-style) text-only chemistry benchmark.

    One dataset per task in ``TASKS`` (forward synthesis, retrosynthesis,
    molecule captioning/generation, name conversion, property prediction).
    ``evaluate`` post-processes predictions with task-specific tag extraction
    and dispatches to the matching metric calculator.
    """

    TYPE = 'TEXT'

    DATASET_URL = {
        # TODO: DATASET_ROOT_PATH is a placeholder, need to modify to actual path or URL
        task: DATASET_ROOT_PATH + f"LLM4Chem_{task}.tsv" for task in TASKS
    }

    # MD5 verification is disabled until the hosted TSVs are finalized.
    DATASET_MD5 = {task: None for task in TASKS}

    @classmethod
    def evaluate(cls, eval_file, **judge_kwargs):
        """Score the predictions in *eval_file*.

        Returns a dict of metric values (raw counts plus derived percentages),
        not a DataFrame.

        Raises:
            ValueError: if the task cannot be inferred from *eval_file*, or if
                the inferred task has no metric dispatch branch.
        """
        data = load(eval_file)
        data = data[~pd.isna(data['prediction'])]
        assert 'answer' in data and 'prediction' in data

        # Infer the task from the eval-file name. BUGFIX: pick the LONGEST
        # matching dataset name — 'retrosynthesis' is a substring of
        # 'retrosynthesis_uspto50k', so a first-match lookup could mis-detect.
        candidates = [name for name in cls.DATASET_URL if name in eval_file]
        if not candidates:
            raise ValueError(f'Cannot infer LLM4Chem task from eval file: {eval_file}')
        task = max(candidates, key=len)

        answers = []
        predictions = []
        for _, entry in data.iterrows():
            answers.append(LLM4Chem_postprocess(str(entry['answer']), task))
            predictions.append(LLM4Chem_postprocess(str(entry['prediction']), task))

        # The metric helpers expect Top-K candidate lists per sample; the
        # framework only exposes Top-1 generations, hence singleton lists.
        # TODO: support true Top-K evaluation once K outputs are available.
        gold_list = [[ans] for ans in answers]
        pred_list = [[pred] for pred in predictions]

        if task == 'molecule_captioning':
            download_nltk()  # text metrics (METEOR) need the NLTK wordnet corpus

        property_tasks = (
            'property_prediction-esol', 'property_prediction-lipo',
            'property_prediction-bbbp', 'property_prediction-clintox',
            'property_prediction-hiv', 'property_prediction-sider',
        )
        if task in property_tasks:
            # Property tasks are scored on the Top-1 candidate only.
            pred_list = [[pred[0]] for pred in pred_list]

        if task in ('forward_synthesis', 'molecule_generation', 'name_conversion-i2s'):
            r = calculate_smiles_metrics(pred_list, gold_list)
        elif task in ('retrosynthesis', 'retrosynthesis_uspto50k'):
            r = calculate_smiles_metrics(
                pred_list, gold_list,
                metrics=('exact_match', 'fingerprint', 'multiple_match'))
        elif task == 'molecule_captioning':
            r = calculate_text_metrics(
                pred_list,
                gold_list,
                text_model='allenai/scibert_scivocab_uncased',
                text_trunc_length=2048,
            )
        elif task in ('name_conversion-i2f', 'name_conversion-s2f'):
            r = calculate_formula_metrics(pred_list, gold_list, metrics=('element_match',))
        elif task == 'name_conversion-s2i':
            r = calculate_formula_metrics(pred_list, gold_list, metrics=('split_match',))
        elif task in ('property_prediction-esol', 'property_prediction-lipo'):
            r = calculate_number_metrics(pred_list, gold_list)
        elif task in property_tasks:
            r = calculate_boolean_metrics(pred_list, gold_list)
        else:
            raise ValueError(task)

        # Derive percentage metrics from whatever raw counts are present.
        num_all = r.get('num_all')
        if num_all:  # also guards against division by zero
            for raw_key, pct_key in (
                ('num_t1_exact_match', 'top1_exact_match'),
                ('num_t5_exact_match', 'top5_exact_match'),
                ('num_t1_ele_match', 'top1_ele_match'),
                ('num_correct', 'accuracy'),
                ('num_t1_split_match', 'top1_split_match'),
                ('num_t5_split_match', 'top5_split_match'),
            ):
                if raw_key in r:
                    r[pct_key] = round(r[raw_key] / num_all * 100, 2)

        return r
168 changes: 168 additions & 0 deletions vlmeval/dataset/utils/llm4chem/config.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,168 @@
# https://github.com/OSU-NLP-Group/LLM4Chem
# https://github.com/otori-bird/retrosynthesis

# All supported LLM4Chem evaluation tasks (SMolInstruct/LlaSMol naming).
# Each entry maps to one TSV dataset and one tag/metric configuration below.
TASKS = (
    'forward_synthesis',
    'retrosynthesis',
    'molecule_captioning',
    'molecule_generation',
    'name_conversion-i2f',
    'name_conversion-i2s',
    'name_conversion-s2f',
    'name_conversion-s2i',
    'property_prediction-esol',
    'property_prediction-lipo',
    'property_prediction-bbbp',
    'property_prediction-clintox',
    'property_prediction-hiv',
    'property_prediction-sider',
    'retrosynthesis_uspto50k',
)


# Default generation limits, used when a task does not override them in
# TASKS_GENERATION_SETTINGS below.
DEFAULT_MAX_INPUT_TOKENS = 512
DEFAULT_MAX_NEW_TOKENS = 1024


# Per-task generation settings: beam-search width and number of returned
# sequences, plus optional overrides for max_new_tokens / batch_size.
# NOTE(review): the key is spelled 'generation_kargs' (not 'kwargs'); the
# consumer presumably reads this exact key — confirm before renaming.
TASKS_GENERATION_SETTINGS = {
    'forward_synthesis': {
        'generation_kargs': {
            'num_return_sequences': 5,
            'num_beams': 8
        },
    },
    'retrosynthesis': {
        'max_new_tokens': 960,
        'generation_kargs': {
            'num_return_sequences': 10,
            'num_beams': 13
        },
    },
    'molecule_captioning': {
        'generation_kargs': {
            'num_return_sequences': 1,
            'num_beams': 4
        },
    },
    'molecule_generation': {
        'generation_kargs': {
            'num_return_sequences': 5,
            'num_beams': 8
        },
    },
    'name_conversion-i2f': {
        'max_new_tokens': 20,
        'generation_kargs': {
            'num_return_sequences': 3,
            'num_beams': 6
        },
    },
    'name_conversion-i2s': {
        'generation_kargs': {
            'num_return_sequences': 5,
            'num_beams': 8
        },
    },
    'name_conversion-s2f': {
        'max_new_tokens': 20,
        'generation_kargs': {
            'num_return_sequences': 3,
            'num_beams': 6
        },
    },
    'name_conversion-s2i': {
        'generation_kargs': {
            'num_return_sequences': 5,
            'num_beams': 8
        },
    },
    'property_prediction-esol': {
        'batch_size': 16,
        'max_new_tokens': 20,
        'generation_kargs': {
            'num_return_sequences': 1,
            'num_beams': 4,
        },
    },
    'property_prediction-lipo': {
        'batch_size': 16,
        'max_new_tokens': 20,
        'generation_kargs': {
            'num_return_sequences': 1,
            'num_beams': 4,
        },
    },
    'property_prediction-bbbp': {
        'batch_size': 16,
        'max_new_tokens': 20,
        'generation_kargs': {
            'num_return_sequences': 1,
            'num_beams': 4,
        },
    },
    'property_prediction-clintox': {
        'batch_size': 16,
        'max_new_tokens': 20,
        'generation_kargs': {
            'num_return_sequences': 1,
            'num_beams': 4,
        },
    },
    'property_prediction-hiv': {
        'batch_size': 16,
        'max_new_tokens': 20,
        'generation_kargs': {
            'num_return_sequences': 1,
            'num_beams': 4,
        },
    },
    'property_prediction-sider': {
        'batch_size': 16,
        'max_new_tokens': 20,
        'generation_kargs': {
            'num_return_sequences': 1,
            'num_beams': 4,
        },
    },
}


# Answer delimiter tags per task: the model is expected to wrap its final
# answer in these tags. (None, None) means the whole output is taken as the
# answer (no tag extraction).
TASK_TAGS = {
    'forward_synthesis': ('<SMILES>', '</SMILES>'),
    'retrosynthesis': ('<SMILES>', '</SMILES>'),
    'molecule_generation': ('<SMILES>', '</SMILES>'),
    'molecule_captioning': (None, None),
    'name_conversion-i2f': ('<MOLFORMULA>', '</MOLFORMULA>'),
    'name_conversion-i2s': ('<SMILES>', '</SMILES>'),
    'name_conversion-s2f': ('<MOLFORMULA>', '</MOLFORMULA>'),
    'name_conversion-s2i': ('<IUPAC>', '</IUPAC>'),
    'property_prediction-esol': ('<NUMBER>', '</NUMBER>'),
    'property_prediction-lipo': ('<NUMBER>', '</NUMBER>'),
    'property_prediction-bbbp': ('<BOOLEAN>', '</BOOLEAN>'),
    'property_prediction-clintox': ('<BOOLEAN>', '</BOOLEAN>'),
    'property_prediction-hiv': ('<BOOLEAN>', '</BOOLEAN>'),
    'property_prediction-sider': ('<BOOLEAN>', '</BOOLEAN>'),
    'retrosynthesis_uspto50k': ('<SMILES>', '</SMILES>'),
}


# These tasks output SMILES, where semicolons may separate different parts.
# To facilitate evaluation, each semicolon is replaced by a dot.
TASKS_WITH_SEMICOLON_REPLACE = ('forward_synthesis', 'retrosynthesis', 'molecule_generation', 'name_conversion-i2s', 'retrosynthesis_uspto50k')


# For these tasks, one input may have multiple gold answers, so the gold
# answers should be read directly from the dataset rather than taken from the
# single gold field of each sample.
TASKS_WITH_READING_GOLD_FROM_DATASET = (
    'forward_synthesis', 'retrosynthesis',
    'molecule_generation', 'molecule_captioning',
    'name_conversion-i2f', 'name_conversion-i2s',
    'name_conversion-s2f', 'name_conversion-s2i',
    'retrosynthesis_uspto50k',
)


# Mapping from LlaSMol fine-tuned checkpoints to the base models they were
# trained from (HuggingFace model ids).
BASE_MODELS = {
    'osunlp/LlaSMol-Mistral-7B': 'mistralai/Mistral-7B-v0.1',
    'osunlp/LlaSMol-Galactica-6.7B': 'facebook/galactica-6.7b',
    'osunlp/LlaSMol-Llama2-7B': 'meta-llama/Llama-2-7b-hf',
    'osunlp/LlaSMol-CodeLlama-7B': 'codellama/CodeLlama-7b-hf',
}
1 change: 1 addition & 0 deletions vlmeval/dataset/utils/llm4chem/utils/__input__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
# BUGFIX: an absolute `import smiles_canonicalization` fails when this module
# lives inside a package (the sibling module is not on sys.path); use an
# explicit relative import instead.
# NOTE(review): this file is named '__input__.py' — if it is meant to be the
# package initializer it should be '__init__.py'; confirm the intended name.
from . import smiles_canonicalization
Loading
Loading