# easycv/models/backbones/vision_transformer.py
# Module-level imports assumed by this excerpt (PatchEmbed and Block are
# defined elsewhere in this file):
from functools import partial

import torch
import torch.nn as nn


def __init__(self,
             img_size=[224],
             patch_size=16,
             in_chans=3,
             num_classes=1000,
             embed_dim=768,
             depth=12,
             num_heads=12,
             mlp_ratio=4.,
             qkv_bias=False,
             qk_scale=None,
             drop_rate=0.,
             attn_drop_rate=0.,
             drop_path_rate=0.,
             norm_layer=partial(nn.LayerNorm, eps=1e-6),
             global_pool=False,
             use_layer_scale=False,
             init_scale=1e-4,
             hydra_attention=False,
             hydra_attention_layers=None,
             use_dpr_linspace=True,
             **kwargs):
    super().__init__()
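
    # Hydra Attention can run in at most `depth` blocks; by default it runs
    # in all of them.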
    if hydra_attention:
        if hydra_attention_layers is None:
            hydra_attention_layers = depth
        elif hydra_attention_layers > depth:
            raise ValueError(
                'When using Hydra Attention, hydra_attention_layers must '
                'be less than or equal to depth.')
    self.num_features = self.embed_dim = embed_dim
    self.num_heads = num_heads
    self.mlp_ratio = mlp_ratio
    self.qkv_bias = qkv_bias
    self.qk_scale = qk_scale
    self.drop_rate = drop_rate
    self.attn_drop_rate = attn_drop_rate
    self.norm_layer = norm_layer
    self.use_layer_scale = use_layer_scale
    self.init_scale = init_scale
    self.hydra_attention = hydra_attention
    self.hydra_attention_layers = hydra_attention_layers
    self.drop_path_rate = drop_path_rate
    self.depth = depth
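
    # Tokenizer: split the image into fixed-size patches and linearly embed
    # each patch.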
    self.patch_embed = PatchEmbed(
        img_size=img_size[0],
        patch_size=patch_size,
        in_chans=in_chans,
        embed_dim=embed_dim)
    num_patches = self.patch_embed.num_patches
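
    # Learnable class token and positional embeddings: one position per
    # patch plus one for the prepended [CLS] token.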
    self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
    self.pos_embed = nn.Parameter(
        torch.zeros(1, num_patches + 1, embed_dim))
    self.pos_drop = nn.Dropout(p=drop_rate)
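
    # Stochastic depth (drop-path) schedule: either a linear ramp from 0 to
    # drop_path_rate across the blocks, or the same rate for every block.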
    if use_dpr_linspace:
        dpr = [
            x.item()
            for x in torch.linspace(0, self.drop_path_rate, self.depth)
        ]
    else:
        dpr = [drop_path_rate] * self.depth
    self.dpr = dpr
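
    # Per-block attention layout: the last `hydra_attention_layers` blocks
    # use Hydra Attention, where the head count equals the embedding
    # dimension; earlier blocks keep standard multi-head attention.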
    if self.hydra_attention:
        hy = [
            x >= (self.depth - self.hydra_attention_layers)
            for x in range(self.depth)
        ]
        head = [
            self.embed_dim if x >= (self.depth - self.hydra_attention_layers)
            else self.num_heads for x in range(self.depth)
        ]
    else:
        hy = [False] * self.depth
        head = [self.num_heads] * self.depth
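
    # Transformer encoder: `depth` stacked blocks, each with its own
    # drop-path rate, head count and attention variant.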
    self.blocks = nn.ModuleList([
        Block(
            dim=embed_dim,
            num_heads=head[i],
            mlp_ratio=mlp_ratio,
            qkv_bias=qkv_bias,
            qk_scale=qk_scale,
            drop=drop_rate,
            attn_drop=attn_drop_rate,
            drop_path=dpr[i],
            norm_layer=norm_layer,
            use_layer_scale=use_layer_scale,
            init_values=init_scale,
            hydra_attention=hy[i]) for i in range(depth)
    ])
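
    # Final norm over the token sequence (set to None below when global
    # average pooling supplies fc_norm instead).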
    self.norm = norm_layer(embed_dim)

    # Classifier head
    self.head = nn.Linear(
        embed_dim, num_classes) if num_classes > 0 else nn.Identity()

    # Use global average pooling
    self.global_pool = global_pool
    if self.global_pool:
        self.fc_norm = norm_layer(embed_dim)
        self.norm = None
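
# A minimal usage sketch (illustrative only): this __init__ is assumed to
# belong to the module's VisionTransformer class, with PatchEmbed and Block
# defined elsewhere in the file.
#
#     model = VisionTransformer(
#         img_size=[224],
#         patch_size=16,
#         embed_dim=768,
#         depth=12,
#         num_heads=12,
#         hydra_attention=True,
#         hydra_attention_layers=6)
#
# With this configuration the first 6 blocks use standard 12-head attention
# and the last 6 run Hydra Attention with num_heads == embed_dim == 768.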