models/cfg/vanilla/visdrone_rtmdetm.yaml (38 lines of code) (raw):
# Model-wide parameters
nc: 10  # number of classes

# Disabled alternative: larger scaling with the anchor set labelled "1920".
# Kept for reference only.
# depth_multiple: 1.33  # model depth multiple
# width_multiple: 1.25  # layer channel multiple
# anchors:
#   - [8,10, 10,22, 20,18]  # P3/8
#   - [23,40, 49,28, 41,59]  # P4/16
#   - [93,48, 81,93, 170,140]  # P5/32

# Active scaling
depth_multiple: 0.67  # model depth multiple
width_multiple: 0.75  # layer channel multiple

# Active anchor set, labelled "1536" (presumably the input image size this
# set was tuned for -- TODO confirm). One row per detection level, three
# pairs per row.
# NOTE(review): seven of the nine pairs are ~0.8x their counterparts in the
# "1920" set above; `10,22` is unchanged and `19,15` does not follow that
# ratio -- confirm both are intended.
anchors:
  - [6,8, 10,22, 19,15]  # P3/8
  - [19,32, 39,22, 32,47]  # P4/16
  - [74,38, 65,74, 136,112]  # P5/32
# Backbone (YOLOv5-style layer table): Conv layers interleaved with
# RTMDetCSPLayer stages, with an SPP layer ahead of the final stage.
# Row format: [from, number, module, args]
# Trailing comment: layer index, plus the P-level/stride where one is tapped.
backbone:
  - [-1, 1, Conv, [32, 3, 2]]  # 0
  - [-1, 1, Conv, [32, 3, 1]]  # 1
  - [-1, 1, Conv, [64, 3, 1]]  # 2-P1/2
  - [-1, 1, Conv, [128, 3, 2]]  # 3
  - [-1, 3, RTMDetCSPLayer, [128, True, True]]  # 4-P2/4
  - [-1, 1, Conv, [256, 3, 2]]  # 5
  - [-1, 6, RTMDetCSPLayer, [256, True, True]]  # 6-P3/8
  - [-1, 1, Conv, [512, 3, 2]]  # 7
  - [-1, 6, RTMDetCSPLayer, [512, True, True]]  # 8-P4/16
  - [-1, 1, Conv, [1024, 3, 2]]  # 9
  - [-1, 1, SPP, [1024, [5, 9, 13]]]  # 10
  - [-1, 3, RTMDetCSPLayer, [1024, False, True]]  # 11-P5/32
# Head (YOLOv5-style layer table), same row format as the backbone:
# [from, number, module, args]. Layer indices continue from the backbone
# (0-11), so the head starts at 12.
#   12-19: upsample and concatenate with backbone layers 8 (P4) and 6 (P3)
#   20-25: Conv, then concatenate with head layers 16 and 12
#   26:    Detect over layers 19, 22 and 25
head:
  - [-1, 1, Conv, [512, 1, 1]]  # 12
  - [-1, 1, nn.Upsample, [None, 2, 'nearest']]  # 13
  - [[-1, 8], 1, Concat, [1]]  # 14 cat backbone P4
  - [-1, 3, RTMDetCSPLayer, [512, False, False]]  # 15
  - [-1, 1, Conv, [256, 1, 1]]  # 16
  - [-1, 1, nn.Upsample, [None, 2, 'nearest']]  # 17
  - [[-1, 6], 1, Concat, [1]]  # 18 cat backbone P3
  - [-1, 3, RTMDetCSPLayer, [256, False, False]]  # 19 (P3/8-small)
  - [-1, 1, Conv, [256, 3, 2]]  # 20
  - [[-1, 16], 1, Concat, [1]]  # 21 cat head P4 (layer 16)
  - [-1, 3, RTMDetCSPLayer, [512, False, False]]  # 22 (P4/16-medium)
  - [-1, 1, Conv, [512, 3, 2]]  # 23
  - [[-1, 12], 1, Concat, [1]]  # 24 cat head P5 (layer 12)
  - [-1, 3, RTMDetCSPLayer, [1024, False, False]]  # 25 (P5/32-large)
  - [[19, 22, 25], 1, Detect, [nc, anchors]]  # 26 Detect(P3, P4, P5)