in model.py [0:0]
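# Note: run_phi below is a method excerpted from a larger class in model.py;
# it assumes that torch, torch.nn.functional (aliased as Fu) and the project
# helpers rand_rot, so3_exponential_map and argmin_translation are already in
# scope. A minimal sketch of the imports the body relies on (the helper
# module paths are assumptions, not taken from this excerpt):
#
#     import torch
#     import torch.nn.functional as Fu
#     # project-specific helpers (assumed locations):
#     # from tools.so3 import rand_rot, so3_exponential_map
#     # from tools.functions import argmin_translation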
def run_phi(self,
            kp_loc,
            kp_vis,
            class_mask=None,
            ):
    preds = {}
    # batch size
    ba = kp_loc.shape[0]
    # tensor type string (legacy .type() API), used below to create new
    # tensors with the same dtype/device as the input keypoints
    dtype = kp_loc.type()
    kp_loc_orig = kp_loc.clone()
    # optionally augment the input 2D keypoints with a random in-plane
    # rotation about the z-axis
    if self.z_augment and self.training:
        R_rand = rand_rot(ba,
                          dtype=dtype,
                          max_rot_angle=float(self.z_augment_rot_angle),
                          axes=(0, 0, 1))
        kp_loc_in = torch.bmm(R_rand[:, 0:2, 0:2], kp_loc)
    else:
        R_rand = torch.eye(3).type(dtype)[None].repeat((ba, 1, 1))
        kp_loc_in = kp_loc_orig
    if self.z_equivariance and self.training:
        # append a copy of the input rotated by a random in-plane (xy)
        # rotation, used for the equivariance constraint
        R_rand_eq = rand_rot(ba,
                             dtype=dtype,
                             max_rot_angle=float(
                                 self.z_equivariance_rot_angle),
                             axes=(0, 0, 1))
        kp_loc_in = torch.cat(
            (kp_loc_in,
             torch.bmm(R_rand_eq[:, 0:2, 0:2], kp_loc_in)
             ), dim=0)
        kp_vis_in = kp_vis.repeat((2, 1))
    else:
        kp_vis_in = kp_vis
    # mask kp_loc by kp_visibility
    kp_loc_masked = kp_loc_in * kp_vis_in[:, None, :]
    # vectorize
    kp_loc_flatten = kp_loc_masked.view(-1, 2*self.n_keypoints)
    # concatenate visibilities and kp locations
    l1_input = torch.cat((kp_loc_flatten, kp_vis_in), dim=1)
    # pass to network
    if self.independent_phi_for_aug and l1_input.shape[0] == 2*ba:
        feats = torch.cat([self.phi(l1_[:, :, None, None]) for
                           l1_ in l1_input.split(ba, dim=0)], dim=0)
    else:
        feats = self.phi(l1_input[:, :, None, None])
    # coefficients into the linear basis
    shape_coeff = self.alpha_layer(feats)[:, :, 0, 0]
    if self.z_equivariance and self.training:
        # use the shape coeff from the second set of preds
        shape_coeff = shape_coeff[ba:]
        # take the feats from the first set
        feats = feats[:ba]
    # shape prediction is just a linear layer implemented as a conv
    shape_canonical = self.shape_layer(
        shape_coeff[:, :, None, None])[:, :, 0, 0]
    shape_canonical = shape_canonical.view(ba, 3, self.n_keypoints)
    if self.keypoint_norm_type == 'to_root':
        # make sure we fix the root at 0
        root_j = shape_canonical[:, :, self.root_joint]
        shape_canonical = shape_canonical - root_j[:, :, None]
    # predict camera params
    # ... log rotation (axis-angle / exponential representation)
    R_log = self.rot_layer(feats)[:, :, 0, 0]
    # convert the 3D log-rotation vector to a 3x3 rotation matrix
    R = so3_exponential_map(R_log)
    # T vector of the camera
    if self.camera_translation:
        T = self.translation_layer(feats)[:, :, 0, 0]
        if self.camera_xy_translation:  # zero out the z-component
            T = T * torch.tensor([1., 1., 0.]).type(dtype)[None, :]
    else:
        T = R_log.new_zeros(ba, 3)
    # offset the translation vector of the camera
    if self.depth_offset > 0.:
        T[:, 2] = T[:, 2] + self.depth_offset
    # scale of the camera
    if self.camera_scale:
        scale = self.scale_layer(feats)[:, 0, 0, 0]
    else:
        scale = R_log.new_ones(ba)
    # rotated+scaled shape in camera coordinates (Y = s R X + T)
    shape_camera_coord = self.apply_similarity_t(
        shape_canonical, R, T, scale)
    # undo the random z-rotation applied to the input
    # (R_rand is the identity if z_augment is off)
    if (self.z_equivariance or self.z_augment) and self.training:
        R_rand_inv = R_rand.transpose(2, 1)
        R = torch.bmm(R_rand_inv, R)
        T = torch.bmm(R_rand_inv, T[:, :, None])[:, :, 0]
        shape_camera_coord = torch.bmm(R_rand_inv, shape_camera_coord)
    # estimate the xy translation that aligns the orthographic projection
    # with the (visible) input keypoints
    if self.argmin_translation:
        assert self.projection_type == 'orthographic'
        projection, _ = self.camera_projection(shape_camera_coord)
        T_amin = argmin_translation(projection, kp_loc_orig, v=kp_vis)
        # append a zero z-component to the estimated xy translation
        T_amin = Fu.pad(T_amin, (0, 1), 'constant', float(0))
        shape_camera_coord = shape_camera_coord + T_amin[:, :, None]
        T = T + T_amin
    if class_mask is not None:
        shape_camera_coord = shape_camera_coord * class_mask[:, None, :]
        shape_canonical = shape_canonical * class_mask[:, None, :]
    preds['R_log'] = R_log
    preds['R'] = R
    preds['scale'] = scale
    preds['T'] = T
    preds['shape_camera_coord'] = shape_camera_coord
    preds['shape_coeff'] = shape_coeff
    preds['shape_canonical'] = shape_canonical
    return preds
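
# For reference: the similarity transform used above ("Y = s R X + T").
# The sketch below is a hedged guess at what apply_similarity_t computes,
# inferred from the comment in run_phi; it is not copied from the source.
def apply_similarity_t_sketch(X, R, T, s):
    # X: (ba, 3, n_kp) canonical shape, R: (ba, 3, 3) rotation,
    # T: (ba, 3) translation, s: (ba,) per-sample scale
    return s[:, None, None] * torch.bmm(R, X) + T[:, :, None]

# Hypothetical usage, assuming kp_loc of shape (ba, 2, n_keypoints) and
# kp_vis of shape (ba, n_keypoints):
#     preds = model.run_phi(kp_loc, kp_vis)
#     preds['shape_camera_coord']  # (ba, 3, n_keypoints) camera-frame shape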