configs/segmentation/upernet/upernet_r50_512x512_8xb4_60e_voc12.py (138 lines of code) (raw):
_base_ = ['configs/base.py']
CLASSES = [
'background', 'aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus',
'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike',
'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'
]
# model settings
num_classes = 21
# norm_cfg = dict(type='SyncBN', requires_grad=True) # multi gpus
norm_cfg = dict(type='BN', requires_grad=True)
model = dict(
type='EncoderDecoder',
pretrained='open-mmlab://resnet50_v1c',
backbone=dict(
type='ResNetV1c',
depth=50,
num_stages=4,
out_indices=(1, 2, 3, 4),
dilations=(1, 1, 1, 1),
strides=(1, 2, 2, 2),
norm_cfg=norm_cfg,
norm_eval=False,
style='pytorch',
contract_dilation=True,
),
decode_head=dict(
type='UPerHead',
in_channels=[256, 512, 1024, 2048],
in_index=[0, 1, 2, 3],
pool_scales=(1, 2, 3, 6),
channels=512,
dropout_ratio=0.1,
num_classes=num_classes,
norm_cfg=norm_cfg,
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
auxiliary_head=dict(
type='FCNHead',
in_channels=1024,
in_index=2,
channels=256,
num_convs=1,
concat_input=False,
dropout_ratio=0.1,
num_classes=21,
norm_cfg=norm_cfg,
align_corners=False,
loss_decode=dict(
type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
# model training and testing settings
train_cfg=dict(),
test_cfg=dict(mode='whole'))
# dataset settings
dataset_type = 'SegDataset'
data_type = 'SegSourceRaw'
data_root = 'data/VOCdevkit/VOC2012'
train_img_root = data_root + 'JPEGImages'
train_label_root = data_root + 'SegmentationClass'
train_list_file = data_root + 'ImageSets/Segmentation/train.txt'
val_img_root = data_root + 'JPEGImages'
val_label_root = data_root + 'SegmentationClass'
val_list_file = data_root + 'ImageSets/Segmentation/val.txt'
test_batch_size = 2
img_norm_cfg = dict(
mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
img_scale = (512, 512)
train_pipeline = [
dict(type='MMResize', img_scale=img_scale, ratio_range=(0.5, 2.0)),
dict(type='SegRandomCrop', crop_size=(512, 512), cat_max_ratio=0.75),
dict(type='MMRandomFlip', flip_ratio=0.5),
dict(type='MMPhotoMetricDistortion'),
dict(type='MMNormalize', **img_norm_cfg),
dict(type='MMPad', size=(512, 512)),
dict(type='DefaultFormatBundle'),
dict(
type='Collect',
keys=['img', 'gt_semantic_seg'],
meta_keys=('filename', 'ori_filename', 'ori_shape', 'img_shape',
'pad_shape', 'scale_factor', 'flip', 'flip_direction',
'img_norm_cfg')),
]
test_pipeline = [
dict(
type='MMMultiScaleFlipAug',
img_scale=img_scale,
# img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
flip=False,
transforms=[
dict(type='MMResize', keep_ratio=True),
dict(type='MMRandomFlip'),
dict(type='MMNormalize', **img_norm_cfg),
dict(type='ImageToTensor', keys=['img']),
dict(
type='Collect',
keys=['img'],
meta_keys=('filename', 'ori_filename', 'ori_shape',
'img_shape', 'pad_shape', 'scale_factor', 'flip',
'flip_direction', 'img_norm_cfg')),
])
]
data = dict(
imgs_per_gpu=4,
workers_per_gpu=4,
train=dict(
type=dataset_type,
ignore_index=255,
data_source=dict(
type=data_type,
img_root=train_img_root,
label_root=train_label_root,
split=train_list_file,
classes=CLASSES),
pipeline=train_pipeline),
val=dict(
imgs_per_gpu=test_batch_size,
ignore_index=255,
type=dataset_type,
data_source=dict(
type=data_type,
img_root=val_img_root,
label_root=val_label_root,
split=val_list_file,
classes=CLASSES,
),
pipeline=test_pipeline))
# optimizer
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
optimizer_config = dict()
# learning policy
lr_config = dict(policy='poly', power=0.9, min_lr=1e-4, by_epoch=True)
# runtime settings
total_epochs = 60
checkpoint_config = dict(interval=1)
eval_config = dict(interval=1, gpu_collect=False)
eval_pipelines = [
dict(
mode='test',
evaluators=[
dict(
type='SegmentationEvaluator',
classes=CLASSES,
metric_names=['mIoU'])
],
)
]