src/transformers/models/vidtr/vidtr_compact.py
import numpy as np
import torch
from torch import nn


def __init__(self, d_model=512, nhead=8,
             num_encoder_layers=6,
             dim_feedforward=2048,
             dropout=0.1, activation="relu",
             normalize_before=False, patch_size=(1, 16, 16),
             in_channel=3, activity_num=157, temporal_size=16,
             layer_pool=(1, 3, 5),          # tuple defaults avoid the
             number_of_keys=(2, 2, 2)):     # shared mutable-default pitfall
    super().__init__()
    self.temporal_size = temporal_size
    # Patch-embedding stem: kernel_size == stride, so each non-overlapping
    # 3D patch is projected to a single d_model-dimensional token.
    self.conv_stem = nn.Conv3d(in_channels=in_channel, out_channels=d_model,
                               kernel_size=patch_size, stride=patch_size,
                               bias=True)
    # Build the encoder stack; a layer whose index appears in layer_pool
    # performs pooling, with k taken from the matching number_of_keys entry.
    layer_list = []
    for i in range(num_encoder_layers):
        if i in layer_pool:
            is_pool = True
            k = number_of_keys[layer_pool.index(i)]
        else:
            is_pool = False
            k = 0
        layer_list.append(
            TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                    dropout, activation, normalize_before,
                                    layer_index=i, pool=is_pool, k=k,
                                    layer_pool=layer_pool,
                                    number_of_keys=number_of_keys))
    encoder_layers = nn.ModuleList(layer_list)
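    # With the defaults layer_pool=(1, 3, 5) and number_of_keys=(2, 2, 2),
    # the loop above yields: layers 0, 2, 4 -> pool=False, k=0 and
    # layers 1, 3, 5 -> pool=True, k=2.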
    encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
    self.encoder = TransformerEncoder(encoder_layers, num_encoder_layers,
                                      encoder_norm)
    self._reset_parameters()
    self.d_model = d_model
    self.nhead = nhead
    # Classification head: global average pool over (T, H, W), then a
    # linear projection onto the activity_num classes.
    self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
    self.fc = nn.Linear(in_features=d_model, out_features=activity_num,
                        bias=True)
    # Learnable cls token and space-time positional embedding. torch.Tensor
    # allocates uninitialized memory, and _reset_parameters() already ran
    # above, so these presumably rely on a later explicit init or on loaded
    # checkpoint weights.
    self.cls = nn.Parameter(torch.Tensor(1, 1, d_model))
    # Plain numpy buffer (not a registered parameter) sized for a ViT-style
    # 14 x 14 patch grid plus the cls token, presumably a staging area for
    # image-pretrained positional embeddings.
    self.pos_embedding_google = np.zeros((1, 14 * 14 + 1, d_model),
                                         dtype=np.float32)
    self.pos_embedding = nn.Parameter(
        torch.Tensor(1, self.temporal_size * 14 * 14 + 1, self.d_model))
    self.dropout = nn.Dropout(0.5)
    self.dp_pos = nn.Dropout(0.1)
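
# A minimal sketch (added for illustration; not part of the upstream file) of
# the token geometry the constructor assumes: the hard-coded 14 x 14 spatial
# grid in pos_embedding implies 224 x 224 inputs with 16 x 16 spatial patches,
# so a 16-frame clip yields temporal_size * 14 * 14 + 1 = 3137 positions (the
# extra one is the cls token).
if __name__ == "__main__":
    stem = nn.Conv3d(in_channels=3, out_channels=512,
                     kernel_size=(1, 16, 16), stride=(1, 16, 16), bias=True)
    clip = torch.zeros(2, 3, 16, 224, 224)    # (batch, C, T, H, W)
    feat = stem(clip)                          # -> (2, 512, 16, 14, 14)
    t, h, w = feat.shape[2:]
    assert t * h * w + 1 == 16 * 14 * 14 + 1 == 3137
    print(tuple(feat.shape), t * h * w + 1)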