in pytorchvideo/layers/accelerator/mobile_cpu/attention.py [0:0]
def convert(self, input_blob_size, **kwargs):
    """
    Convert the squeeze-excite (SE) block into an efficient CPU version.

    The 1x1(x1) convolutions of the original SE block are replaced by
    ``nn.Linear`` layers (better supported by CPU inference). The pooled
    (B, C, 1, 1[, 1]) tensor is flattened to (B, C) before the linear
    layers, and the sigmoid output is reshaped back so it broadcasts over
    the input tensor in the final elementwise multiplication.

    Args:
        input_blob_size (tuple): input tensor size, (B, C, T, H, W) when
            ``self.is_3d`` else (B, C, H, W). Only the first two entries
            (batch, channels) are used for the reshape layers.
        kwargs: unused; kept for a uniform ``convert()`` interface.
    """
    avg_pool = nn.AdaptiveAvgPool3d(1) if self.is_3d else nn.AdaptiveAvgPool2d(1)
    # Flatten pooled output to (B, C) so it can feed the linear layers.
    reshape0 = _Reshape((input_blob_size[0], input_blob_size[1]))

    def _conv1x1_to_linear(conv):
        # A 1x1(x1) conv applied to a (B, C, 1, 1[, 1]) tensor is exactly a
        # linear layer on (B, C); copy the weights, collapsing the trailing
        # singleton kernel dims. reshape (not squeeze) is used so layers
        # with a single input or output channel keep a 2-D weight matrix.
        fc = nn.Linear(
            conv.in_channels,
            conv.out_channels,
            bias=conv.bias is not None,
        )
        state_dict = deepcopy(conv.state_dict())
        state_dict["weight"] = state_dict["weight"].reshape(
            conv.out_channels, conv.in_channels
        )
        fc.load_state_dict(state_dict)
        return fc

    # Assumes self.se.block is [conv, activation, conv, sigmoid] — the
    # layout produced by the original SE implementation in this file.
    fc0 = _conv1x1_to_linear(self.se.block[0])
    activation = deepcopy(self.se.block[1])
    fc1 = _conv1x1_to_linear(self.se.block[2])
    sigmoid = deepcopy(self.se.block[3])
    # Linear output is (B, C); restore (B, C, 1, 1[, 1]) so it broadcasts
    # against the input tensor when multiplied.
    reshape_size_after_sigmoid = (input_blob_size[0], input_blob_size[1], 1, 1) + (
        (1,) if self.is_3d else ()
    )
    reshape1 = _Reshape(reshape_size_after_sigmoid)
    se_layers = nn.Sequential(
        avg_pool, reshape0, fc0, activation, fc1, sigmoid, reshape1
    )
    # Add final elementwise multiplication and replace self.se
    self.se = _SkipConnectMul(se_layers)
    self.convert_flag = True