in pytorchvideo/models/head.py [0:0]
def forward(self, x: torch.Tensor, bboxes: torch.Tensor) -> torch.Tensor:
    """
    Run the head's configured stages in order: 3d pooling, RoI cropping,
    spatial pooling, dropout, channel projection, activation, and output
    pooling. Every stage is optional (skipped when its module is ``None``).

    Args:
        x (torch.Tensor): input tensor; the permutes below assume a
            (batch, channel, temporal, height, width) layout — TODO confirm
            against the head constructor.
        bboxes (torch.Tensor): Associated bounding boxes.
            The format is N*5 (Index, X_1, Y_1, X_2, Y_2) if using RoIAlign
            and N*6 (Index, x_ctr, y_ctr, width, height, angle_degrees) if
            using RoIAlignRotated. Only consumed when a RoI layer is set.

    Returns:
        torch.Tensor: the transformed tensor; flattened to shape
        (batch, -1) when an output pooling layer is configured.

    Raises:
        ValueError: if a RoI layer is configured but the temporal dimension
            of ``x`` (after 3d pooling) is not 1.
    """
    # Performs 3d pooling.
    if self.pool is not None:
        x = self.pool(x)
    # Performs roi layer using bboxes
    if self.roi_layer is not None:
        temporal_dim = x.shape[-3]
        if temporal_dim != 1:
            # RoI layers operate on 2d feature maps, so the temporal axis
            # must already be collapsed to a single step.
            raise ValueError(
                "Temporal dimension should be 1. Consider modifying the pool layer."
            )
        x = torch.squeeze(x, -3)
        x = self.roi_layer(x, bboxes)
        # Performs spatial 2d pooling.
        if self.pool_spatial is not None:
            x = self.pool_spatial(x)
        # Restore the singleton temporal dimension removed above.
        x = x.unsqueeze(-3)
    # Performs dropout.
    if self.dropout is not None:
        x = self.dropout(x)
    # Performs projection.
    if self.proj is not None:
        # Move channels last so the projection applies over the channel
        # axis, then restore the original layout.
        x = x.permute((0, 2, 3, 4, 1))
        x = self.proj(x)
        x = x.permute((0, 4, 1, 2, 3))
    # Performs activation.
    if self.activation is not None:
        x = self.activation(x)
    if self.output_pool is not None:
        # Performs global averaging, then flattens to (batch, -1).
        x = self.output_pool(x)
        x = x.view(x.shape[0], -1)
    return x