in src/transformers/models/vidtr/vidtr_split.py [0:0]
def __init__(self, d_model=512, nhead=8, num_encoder_layers=6,
             dim_feedforward=2048, dropout=0.1,
             activation="relu", normalize_before=False, patch_size=(1, 16, 16),
             in_channel=3, activity_num=157, merge_index=6,
             temporal_size=16, merge_later=False):
    super().__init__()
    self.temporal_size = temporal_size
    self.merge_later = merge_later
    # Non-overlapping patch embedding: kernel_size == stride, so each
    # (1, 16, 16) video patch is projected to a single d_model-dim token.
    self.conv_stem = nn.Conv3d(in_channels=in_channel, out_channels=d_model,
                               kernel_size=patch_size, stride=patch_size, bias=True)
    # Layer-wise positional embeddings are enabled only for the default 16-frame input.
    pos_embedding_layer_wise = self.temporal_size == 16
    # Build the encoder stack; merge_index, merge_later and the layer index are
    # forwarded to every layer.
    layer_list = []
    for i in range(num_encoder_layers):
        module_temp = TransformerEncoderLayer(d_model, nhead, dim_feedforward,
                                              dropout, activation, normalize_before,
                                              merge_index=merge_index,
                                              merge_later=merge_later,
                                              layer_index=i,
                                              pos_embedding_layer_wise=pos_embedding_layer_wise)
        layer_list.append(module_temp)
    encoder_layers = nn.ModuleList(layer_list)
    encoder_norm = nn.LayerNorm(d_model) if normalize_before else None
    self.encoder = TransformerEncoder(encoder_layers, num_encoder_layers, encoder_norm)
    self._reset_parameters()
    self.d_model = d_model
    self.nhead = nhead
    # Classification head: global average pooling followed by a linear
    # classifier over activity_num classes.
    self.avg_pool = nn.AdaptiveAvgPool3d((1, 1, 1))
    self.fc = nn.Linear(in_features=d_model, out_features=activity_num, bias=True)
    # Learnable class tokens and positional embeddings. Note that these are
    # allocated (uninitialized) after the _reset_parameters() call above.
    if merge_later:
        # Single class token plus positions for temporal_size * 14 * 14 patch tokens.
        self.cls = nn.Parameter(torch.Tensor(1, 1, d_model))
        self.pos_embedding = nn.Parameter(torch.Tensor(1, self.temporal_size * 14 * 14 + 1, self.d_model))
    else:
        # Per-frame class tokens (cls_s), a class-token row of length 14 * 14 + 1
        # (cls_t), and positions for the full (temporal_size + 1) x (14 * 14 + 1) token grid.
        self.pos_embedding = nn.Parameter(torch.Tensor(1, (self.temporal_size + 1) * (14 * 14 + 1), self.d_model))
        self.cls_s = nn.Parameter(torch.Tensor(temporal_size, 1, d_model))
        self.cls_t = nn.Parameter(torch.Tensor(1, 14 * 14 + 1, d_model))
    self.dropout = nn.Dropout(0.5)
    self.dp_pos = nn.Dropout(0.1)
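
For orientation, below is a minimal, self-contained sketch of what the Conv3d stem above does to an input clip. The 224x224 spatial resolution and the flatten/transpose step are assumptions inferred from the hard-coded 14 x 14 grid in the positional embeddings; the model's actual forward pass is not shown in this excerpt.

import torch
import torch.nn as nn

# Stand-alone copy of the patch stem, using the constructor defaults above.
patch_size = (1, 16, 16)          # (frames, height, width) per patch
d_model = 512
conv_stem = nn.Conv3d(in_channels=3, out_channels=d_model,
                      kernel_size=patch_size, stride=patch_size, bias=True)

# Assumed input: a batch of two 16-frame 224x224 RGB clips.
clip = torch.randn(2, 3, 16, 224, 224)
feat = conv_stem(clip)                        # -> (2, 512, 16, 14, 14)
tokens = feat.flatten(2).transpose(1, 2)      # -> (2, 16 * 14 * 14, 512) token sequence

# 16 * 14 * 14 + 1 = 3137 matches the merge_later positional-embedding length above.
print(tokens.shape, 16 * 14 * 14 + 1)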