-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathprep_patent_data.py
More file actions
executable file
·48 lines (38 loc) · 1.55 KB
/
prep_patent_data.py
File metadata and controls
executable file
·48 lines (38 loc) · 1.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
from dataset import PatentDescDataset
import json
from tqdm import tqdm
import argparse
import os
# Angle-bracket tokens that are rewritten inside descriptions before export.
# NOTE(review): presumably these would otherwise be read as image/control
# markers by the downstream LLaVA-style consumer -- confirm against the trainer.
# The trailing "<" and ">" entries mean that, after the named tokens are
# handled, every remaining angle bracket is rewritten as well.
SPECIAL_TOKENS = (
    "<image>",
    "<im_patch>",
    "<im_start>",
    "<im_end>",
    "<image-placeholder>",
    "<",
    ">",
)

# Each token mapped to its bracket-escaped form, e.g. "<image>" -> "[[image]]".
TOKEN_REPLACEMENTS = {
    tok: tok.replace('<', '[[').replace('>', ']]') for tok in SPECIAL_TOKENS
}


def escape_special_tokens(text):
    """Return *text* with every special token replaced by its ``[[...]]`` form.

    Tokens are applied in the order listed in ``SPECIAL_TOKENS``; because the
    list ends with the bare ``<`` and ``>`` characters, no angle bracket is
    left in the result.
    """
    for tok in SPECIAL_TOKENS:
        text = text.replace(tok, TOKEN_REPLACEMENTS[tok])
    return text


def build_llava_sample(fig_id, description, desc_type):
    """Build one conversation record for a figure.

    Args:
        fig_id: Figure identifier; used as the record id and as the stem of
            the image file name (``<fig_id>.png``).
        description: Raw description text; special tokens are escaped before
            it is stored as the "gpt" turn.
        desc_type: Description type, interpolated into the "human" prompt.

    Returns:
        A dict with "id", "image" and a two-turn "conversations" list.
    """
    return {
        "id": fig_id,
        "image": f"{fig_id}.png",
        "conversations": [
            {
                "from": "human",
                # The literal "<image>" in the prompt is intentional and is
                # NOT escaped; only the description text is.
                "value": f"<image>\nWrite a {desc_type} description for this patent image."
            },
            {
                "from": "gpt",
                "value": escape_special_tokens(description)
            },
        ]
    }


def main():
    """Convert one split of the patent description dataset to a JSON file.

    Reads ``--desc_type``, ``--split`` and ``--data_dir`` from the command
    line, iterates ``PatentDescDataset`` (constructed with ``ocr_only=True``)
    and writes the list of conversation records to
    ``<data_dir>/llava_json/<desc_type>_<split>``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--desc_type', type=str, required=True)
    parser.add_argument('--split', type=str, required=True)
    parser.add_argument('--data_dir', type=str, required=True)
    args = parser.parse_args()

    print(f"Preparing {args.desc_type} descriptions for {args.split} split")

    # Resolve and create the output directory *before* the (potentially long)
    # dataset pass, so a missing ``llava_json`` folder cannot cause a
    # FileNotFoundError only after all the work has been done.
    # NOTE(review): the output file name has no ".json" extension; kept as-is
    # because downstream code may depend on the exact path.
    out_path = os.path.join(args.data_dir, f'llava_json/{args.desc_type}_{args.split}')
    os.makedirs(os.path.dirname(out_path), exist_ok=True)

    data = PatentDescDataset(split=args.split, desc_type=args.desc_type, data_dir=args.data_dir, ocr_only=True)
    llava_dataset = [
        build_llava_sample(sample['fig_id'], sample['description'], args.desc_type)
        for sample in tqdm(data)
    ]

    print(f"Writing {args.desc_type} descriptions for {args.split} split")
    # json.dump escapes non-ASCII by default, so the explicit encoding does
    # not change the bytes written; it only removes the locale dependency.
    with open(out_path, 'w', encoding='utf-8') as f:
        json.dump(llava_dataset, f)


if __name__ == '__main__':
    main()