Hello, I trained the YOLO-World model on 48 seen (base) classes from the COCO dataset and tested it on 17 unseen (novel) classes. The training loss decreases normally, but the test mAP is very low. Is there a problem with my settings?
【Dataload Settings】
instances_val2017_novel.json: only the annotations belonging to the novel classes are retained; the `categories` field of the original instances_val2017.json is left unchanged.
# The 17 held-out (novel / unseen) COCO class names used for evaluation.
novel_classes = (
    "umbrella", "cow", "cup", "bus", "keyboard", "skateboard", "dog",
    "couch", "tie", "snowboard", "sink", "elephant", "cake", "scissors",
    "airplane", "cat", "knife",
)
Contents of mscoco_unseen_classes.json (one single-element text list per novel class, in the same order as novel_classes):
[["umbrella"], ["cow"], ["cup"], ["bus"], ["keyboard"], ["skateboard"], ["dog"], ["couch"], ["tie"], ["snowboard"], ["sink"], ["elephant"], ["cake"], ["scissors"], ["airplane"], ["cat"], ["knife"]]
# Training dataset: the 48 seen/base COCO classes with per-class text prompts.
# FIX: mmengine's config override key is `_delete_=True` (with underscores);
# `delete=True` is not recognised as an override marker and is instead
# forwarded to the dataset constructor. (If this config was pasted through
# markdown, the underscores were probably stripped by italic rendering.)
coco_train_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        metainfo=dict(classes=base_classes),
        data_root='data/coco/',
        ann_file='annotations/instances_train2017_base.json',
        data_prefix=dict(img='train2017/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    # JSON file with one text-prompt list per seen class, aligned with
    # `base_classes` order.
    class_text_path='data/texts/mscoco_seen_classes.json',
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    # Multi-modal collate function that also batches the text prompts.
    collate_fn=dict(type='yolow_collate'),
    dataset=coco_train_dataset)
# Evaluation pipeline: reuse the base test transforms (dropping the base
# packing step), then attach the class texts and pack them into the meta info.
_test_meta_keys = ('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts')
test_pipeline = [
    *base.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs', meta_keys=_test_meta_keys),
]
# Evaluation dataset: the 17 novel (unseen) COCO classes.
# FIX: mmengine's config override key is `_delete_=True` (with underscores);
# `delete=True` is not recognised as an override marker and is instead
# forwarded to the dataset constructor. (If this config was pasted through
# markdown, the underscores were probably stripped by italic rendering.)
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        # Class names reported at eval time; must line up, in order, with
        # the entries of the class-text JSON below.
        metainfo=dict(classes=novel_classes),
        data_root='data/coco/',
        ann_file='annotations/instances_val2017_novel.json',
        data_prefix=dict(img='val2017/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    # JSON file with one text-prompt list per novel class, aligned with
    # `novel_classes` order.
    class_text_path='data/texts/mscoco_unseen_classes.json',
    pipeline=test_pipeline)
【Model Settings 】
yolo_world_v2_l_clip_large.
# Text branch of the training pipeline: sample class texts for each image
# (padding to a fixed number of prompts), then pack images and texts together.
_train_meta_keys = ('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts')
_random_load_text = dict(
    type='RandomLoadText',
    num_neg_samples=(num_classes, num_classes),
    max_num_samples=num_training_classes,
    padding_to_max=True,
    padding_value='')
text_transform = [
    _random_load_text,
    dict(type='mmdet.PackDetInputs', meta_keys=_train_meta_keys),
]
# Mosaic + random-affine augmentation, shared by the mosaic stage and by
# MixUp's pre_transform in the training pipeline.
mosaic_affine_transform = [
    dict(type='MultiModalMosaic',
         img_scale=base.img_scale,
         pad_val=114.0,
         pre_transform=base.pre_transform),
    dict(type='YOLOv5RandomAffine',
         max_rotate_degree=0.0,
         max_shear_degree=0.0,
         max_aspect_ratio=100.,
         scaling_ratio_range=(1 - base.affine_scale, 1 + base.affine_scale),
         # img_scale is (width, height); shift the mosaic centre by half of
         # each dimension.
         border=(-base.img_scale[0] // 2, -base.img_scale[1] // 2),
         border_val=(114, 114, 114)),
]
# Full stage-1 training pipeline.
# FIX: the original inserted `mosaic_affine_transform` (a list) as a single
# pipeline element, and nested `base.pre_transform` (also a list) inside
# MixUp's pre_transform. Transform pipelines must contain transform dicts,
# not lists, so both lists have to be splatted with `*`. (A markdown paste
# likely swallowed the original asterisks.)
train_pipeline = [
    *base.pre_transform,
    *mosaic_affine_transform,
    dict(
        type='YOLOv5MultiModalMixUp',
        prob=base.mixup_prob,
        pre_transform=[*base.pre_transform,
                       *mosaic_affine_transform]),
    *base.last_transform[:-1],
    *text_transform,
]
# Stage-2 (close-mosaic) pipeline: base stage-2 transforms minus the final
# packing step, followed by the text-aware load/pack transforms.
train_pipeline_stage2 = [
*base.train_pipeline_stage2[:-1],
*text_transform
]
【Test results】

Hello, I trained the YOLO-World model on 48 seen (base) classes from the COCO dataset and tested it on 17 unseen (novel) classes. The training loss decreases normally, but the test mAP is very low. Is there a problem with my settings?
【Dataload Settings】
instances_val2017_novel.json: only the annotations belonging to the novel classes are retained; the `categories` field of the original instances_val2017.json is left unchanged.
# The 17 held-out (novel / unseen) COCO class names used for evaluation.
novel_classes = (
    "umbrella", "cow", "cup", "bus", "keyboard", "skateboard", "dog",
    "couch", "tie", "snowboard", "sink", "elephant", "cake", "scissors",
    "airplane", "cat", "knife",
)
Contents of mscoco_unseen_classes.json (one single-element text list per novel class, in the same order as novel_classes):
[["umbrella"], ["cow"], ["cup"], ["bus"], ["keyboard"], ["skateboard"], ["dog"], ["couch"], ["tie"], ["snowboard"], ["sink"], ["elephant"], ["cake"], ["scissors"], ["airplane"], ["cat"], ["knife"]]
# Training dataset: the 48 seen/base COCO classes with per-class text prompts.
# FIX: mmengine's config override key is `_delete_=True` (with underscores);
# `delete=True` is not recognised as an override marker and is instead
# forwarded to the dataset constructor. (If this config was pasted through
# markdown, the underscores were probably stripped by italic rendering.)
coco_train_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        metainfo=dict(classes=base_classes),
        data_root='data/coco/',
        ann_file='annotations/instances_train2017_base.json',
        data_prefix=dict(img='train2017/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    # JSON file with one text-prompt list per seen class, aligned with
    # `base_classes` order.
    class_text_path='data/texts/mscoco_seen_classes.json',
    pipeline=train_pipeline)

train_dataloader = dict(
    batch_size=train_batch_size_per_gpu,
    # Multi-modal collate function that also batches the text prompts.
    collate_fn=dict(type='yolow_collate'),
    dataset=coco_train_dataset)
# Evaluation pipeline: reuse the base test transforms (dropping the base
# packing step), then attach the class texts and pack them into the meta info.
_test_meta_keys = ('img_id', 'img_path', 'ori_shape', 'img_shape',
                   'scale_factor', 'pad_param', 'texts')
test_pipeline = [
    *base.test_pipeline[:-1],
    dict(type='LoadText'),
    dict(type='mmdet.PackDetInputs', meta_keys=_test_meta_keys),
]
# Evaluation dataset: the 17 novel (unseen) COCO classes.
# FIX: mmengine's config override key is `_delete_=True` (with underscores);
# `delete=True` is not recognised as an override marker and is instead
# forwarded to the dataset constructor. (If this config was pasted through
# markdown, the underscores were probably stripped by italic rendering.)
coco_val_dataset = dict(
    _delete_=True,
    type='MultiModalDataset',
    dataset=dict(
        type='YOLOv5CocoDataset',
        # Class names reported at eval time; must line up, in order, with
        # the entries of the class-text JSON below.
        metainfo=dict(classes=novel_classes),
        data_root='data/coco/',
        ann_file='annotations/instances_val2017_novel.json',
        data_prefix=dict(img='val2017/'),
        filter_cfg=dict(filter_empty_gt=False, min_size=32)),
    # JSON file with one text-prompt list per novel class, aligned with
    # `novel_classes` order.
    class_text_path='data/texts/mscoco_unseen_classes.json',
    pipeline=test_pipeline)
【Model Settings 】
yolo_world_v2_l_clip_large.
# Text branch of the training pipeline: sample class texts for each image
# (padding to a fixed number of prompts), then pack images and texts together.
_train_meta_keys = ('img_id', 'img_path', 'ori_shape', 'img_shape', 'flip',
                    'flip_direction', 'texts')
_random_load_text = dict(
    type='RandomLoadText',
    num_neg_samples=(num_classes, num_classes),
    max_num_samples=num_training_classes,
    padding_to_max=True,
    padding_value='')
text_transform = [
    _random_load_text,
    dict(type='mmdet.PackDetInputs', meta_keys=_train_meta_keys),
]
# Mosaic + random-affine augmentation, shared by the mosaic stage and by
# MixUp's pre_transform in the training pipeline.
mosaic_affine_transform = [
    dict(type='MultiModalMosaic',
         img_scale=base.img_scale,
         pad_val=114.0,
         pre_transform=base.pre_transform),
    dict(type='YOLOv5RandomAffine',
         max_rotate_degree=0.0,
         max_shear_degree=0.0,
         max_aspect_ratio=100.,
         scaling_ratio_range=(1 - base.affine_scale, 1 + base.affine_scale),
         # img_scale is (width, height); shift the mosaic centre by half of
         # each dimension.
         border=(-base.img_scale[0] // 2, -base.img_scale[1] // 2),
         border_val=(114, 114, 114)),
]
# Full stage-1 training pipeline.
# FIX: the original inserted `mosaic_affine_transform` (a list) as a single
# pipeline element, and nested `base.pre_transform` (also a list) inside
# MixUp's pre_transform. Transform pipelines must contain transform dicts,
# not lists, so both lists have to be splatted with `*`. (A markdown paste
# likely swallowed the original asterisks.)
train_pipeline = [
    *base.pre_transform,
    *mosaic_affine_transform,
    dict(
        type='YOLOv5MultiModalMixUp',
        prob=base.mixup_prob,
        pre_transform=[*base.pre_transform,
                       *mosaic_affine_transform]),
    *base.last_transform[:-1],
    *text_transform,
]
# Stage-2 (close-mosaic) pipeline: base stage-2 transforms minus the final
# packing step, followed by the text-aware load/pack transforms.
train_pipeline_stage2 = [
*base.train_pipeline_stage2[:-1],
*text_transform
]
【Test results】