in models/base_model.py
def forward_singlecrop(self, video, target_shape=None):
"""
Args:
video (torch.Tensor): of shape (B, num_clips, C, T, H, W).
target_shape: The shape of the target. Some layers (e.g. the
future predictor) might be able to use this information.
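Returns:
outputs (dict): named intermediate features and logits.
aux_losses (dict): auxiliary losses from the sub-modules.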
"""
outputs = {}
aux_losses = {}
batch_size = video.size(0)
num_clips = video.size(1)
# Fold the clips dimension into the batch for feature extraction, up
# to temporal aggregation
video = video.flatten(0, 1)
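# video: (B * num_clips, C, T, H, W)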
feats = self.backbone(video)
outputs['backbone'] = feats
# Spatial mean
feats = torch.mean(feats, [-1, -2])
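# feats: (B * num_clips, C, T) -- spatial dims averaged out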
# store temporal mean as well
outputs['backbone_mean'] = torch.mean(feats, [-1])
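# backbone_mean: (B * num_clips, C)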
# Project the pooled backbone feature too, if the projection MLP is
# non-empty and its input dimension matches
if len(self.project_mlp) > 0 and (outputs['backbone_mean'].size(-1) ==
self.project_mlp[0].in_features):
outputs['backbone_mean_projected'] = self.project_mlp(
outputs['backbone_mean'])
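# (If the dims don't match here, the projection is still applied to the
# aggregated features later.)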
# Move the time dimension inside: B,C,T -> B,T,C
feats = feats.permute((0, 2, 1))
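# feats: (B * num_clips, T, C)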
# Map the feats to the intermediate dimension that the rest of the
# code operates on; only needed if the backbone features differ
if feats.shape[-1] != self.cfg.intermediate_featdim:
assert self.mapper_to_inter is not None, (
f'The backbone feat does not match intermediate {feats.shape} '
f'and {self.cfg.intermediate_featdim}. Please set '
f'model.backbone_dim correctly.')
feats = self.mapper_to_inter(feats)
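# feats now has self.cfg.intermediate_featdim channels; mapper_to_inter
# is presumably a learned projection (e.g. nn.Linear) from the backbone
# dim -- an assumption, based on the assert above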
feats_agg, agg_losses = self.temporal_aggregator(feats)
aux_losses.update(agg_losses)
feats_agg = self.reset_temp_agg_feat_dim(feats_agg)
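# feats_agg: typically (B * num_clips, F), or (B * num_clips, 1, F) if
# the aggregator keeps a singleton time dim (cf. the assert below)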
outputs['temp_agg'] = feats_agg
# For the contrastive loss, I need a projected version of this feature
outputs['temp_agg_projected'] = self.project_mlp(feats_agg)
# Before future prediction, unfold the clips back out of the batch
# and concatenate them along the temporal dimension
if num_clips > 1:
assert (
(feats_agg.ndim == 2)
or (feats_agg.ndim == 3 and feats_agg.size(1) == 1)
), ('Should be using some temporal aggregation when using clips')
feats_agg = feats_agg.reshape((batch_size, num_clips) +
feats_agg.shape[1:])
if feats_agg.ndim == 4:
feats_agg = torch.flatten(feats_agg, 1, 2)
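# E.g., with B=2, num_clips=3, F=512: (6, 512) -> (2, 3, 512); or
# (6, 1, 512) -> (2, 3, 1, 512) -> (2, 3, 512) after the flatten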
# now feats_agg back to 3D (B, T, F)
feats_past = feats_agg
# Now run the future prediction; it might also update the past
# features, as GPT-style models would
(feats_past, feats_future, future_losses,
endpoints) = self.future_predictor(feats_past, target_shape)
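# endpoints carries any extra tensors the predictor exposes; they get
# merged into the outputs dict below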
aux_losses.update(future_losses)
outputs.update(endpoints)
outputs['future'] = feats_future
outputs['past'] = feats_past
# Apply a classifier on the past features; those logits might be
# supervised as well
if self.cfg.classifier_on_past:
feats_past_drop = self.dropout(feats_past)
outputs.update(
self._apply_classifier(feats_past_drop,
outputs_prefix=PAST_LOGITS_PREFIX))
# For the contrastive loss, I need a projected version of the future feature
outputs['future_projected'] = self.project_mlp(feats_future)
# Aggregate again, if asked for
feats_future_agg, future_agg_losses = (
self.temporal_aggregator_after_future_pred(feats_future))
aux_losses.update(future_agg_losses)
outputs['future_agg'] = feats_future_agg
feats_future_agg_drop = self.dropout(feats_future_agg)
outputs.update(self._apply_classifier(feats_future_agg_drop))
if self.regression_head:
outputs['logits_regression'] = self.regression_head(
feats_future_agg_drop)
return outputs, aux_losses
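# Minimal usage sketch (assumes a constructed instance `model` of this
# class; shapes are illustrative, not prescribed by the code above):
#   video = torch.randn(2, 3, 3, 16, 224, 224)  # (B, num_clips, C, T, H, W)
#   outputs, aux_losses = model.forward_singlecrop(video)
#   feats = outputs['future_agg']  # aggregated future features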