grolp/readers/alfred.py [1312:1428]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            turn_tokens = self._tokenizer.add_special_tokens(task_desc_tokens, high_desc_tokens)

            high_descs_tokens.append(TextField(turn_tokens))

        for i in range(len(low_to_high)):
            # extract the language instruction associated with the current low-level action
            language_instructions.append(high_descs_tokens[low_to_high[i]])

            # mark 1 when this action is the last one of its instruction segment
            # (i.e. the next action belongs to a new instruction, or the episode ends), 0 otherwise
            is_segment_end = i == len(low_to_high) - 1 or low_to_high[i + 1] != low_to_high[i]
            start_instr_labels.append(1.0 if is_segment_end else 0.0)

        start_instr_labels = TensorField(torch.tensor(start_instr_labels, dtype=torch.int32), padding_value=0)
        language_instructions = ListField(language_instructions)

        num_low_actions = len(ex['plan']['low_actions']) + 1  # +1 for additional stop action

        assert num_low_actions == len(actions_low), (
            f"Expected {num_low_actions} low-level actions but got {len(actions_low)}!"
        )
        # list of per-step ArrayFields holding the object features for every rotation view
        object_features = []

        rotation_steps = 360 // self.rotation_angle
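        # e.g. rotation_angle == 90 yields 4 panoramic views per trajectory step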
        visual_attention_mask = []
        interact_object_masks = []
        obj_interact_targets = []

        features_root = os.path.join(root, self.vis_feats_path)

        if not os.path.exists(features_root):
            raise ValueError(f"The visual features path {features_root} does not exist!")

        for i in range(num_low_actions):
            curr_step_features = []
            # always include an extra slot for the ResNet feature associated with the front view
            curr_visual_attention_mask = []
            curr_objects_masks = []

            for j in range(rotation_steps):
                feature_path = os.path.join(features_root, f"{i}-{j}.npz")

                if os.path.exists(feature_path):
                    with np.load(feature_path) as f_features:
                        features, attn_mask, masks = self.extract_visual_features(f_features)
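                    # expected shapes, inferred from the zero-padding fallback in the else branch below:
                    #   features:  (max_objects_per_frame, VISUAL_EMB_SIZE) float32
                    #   attn_mask: (max_objects_per_frame,) with 1 for real detections, 0 for padding
                    #   masks:     (max_objects_per_frame, frame_size, frame_size) uint8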

                    check_nan(features,
                              f"MaskRCNN features -- ID: {metadata['task_id']} -- "
                              f"Type: {metadata['task_type']} -- Rotation step: {j}")

                    curr_step_features.append(features)
                    curr_objects_masks.append(masks)

                    curr_visual_attention_mask.extend(attn_mask)
                else:
                    # handles the (rare) cases where no features were extracted for the current trajectory step
                    curr_step_features.append(np.zeros((self.max_objects_per_frame, VISUAL_EMB_SIZE), dtype=np.float32))
                    curr_objects_masks.append(
                        np.zeros((self.max_objects_per_frame, self.frame_size, self.frame_size), dtype=np.uint8))
                    curr_visual_attention_mask.extend([0] * self.max_objects_per_frame)

            curr_step_features = ArrayField(np.concatenate(curr_step_features, 0))
            curr_visual_attention_mask = ArrayField(np.array(curr_visual_attention_mask, dtype=np.uint8))
            object_features.append(curr_step_features)
            visual_attention_mask.append(curr_visual_attention_mask)
            interact_object_masks.append(curr_objects_masks)

            # derive gold targets for mask interaction
            if actions_low[i]["mask"] is not None:
                # extract the gold interaction mask for the current trajectory step
                # (np.bool was removed in NumPy 1.24; use the builtin bool instead)
                gold_mask = actions_low[i]["mask"].astype(bool)

                # pick the front-view predicted mask with the highest IoU against the gold mask
                front_view_masks = curr_objects_masks[0]
                iou_scores = [mask_iou(gold_mask, mask) for mask in front_view_masks]
                label = int(np.argmax(iou_scores))
            else:
                # no interaction at this step: -100 is the ignore index used when building the targets below
                label = -100

            obj_interact_targets.append(label)

        scene_objects_features = ListField(object_features)
        visual_attention_mask = ListField(visual_attention_mask)

        episode_mask = ArrayField(np.ones((num_low_actions,), dtype=np.uint8))

        actions_low_field = ListField([LabelField(a['action'], "low_action_labels") for a in actions_low])

        # metadata["interactive_object_masks"] = interact_object_masks

        obj_interact_targets = torch.tensor(obj_interact_targets, dtype=torch.int64)
        obj_interact_mask = (obj_interact_targets != -100)
        obj_interact_targets = TensorField(obj_interact_targets, padding_value=-100)
        obj_interact_mask = TensorField(obj_interact_mask, dtype=torch.bool)

        metadata_field = MetadataField(metadata)

        instance = Instance(dict(
            metadata=metadata_field,
            instructions=language_instructions,
            start_instr_labels=start_instr_labels,
            actions=actions_low_field,
            visual_features=scene_objects_features,
            visual_attention_mask=visual_attention_mask,
            actions_mask=episode_mask,
            obj_interact_targets=obj_interact_targets,
            obj_interact_mask=obj_interact_mask
        ))

        # cache the built instance so later lookups can skip the preprocessing above
        with self.lmdb_env.begin(write=True, buffers=True) as txn:
            txn.put(instance_cache_key, pickle.dumps(instance))

        return instance
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
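
Note: the snippets call mask_iou and check_nan without showing their definitions. The sketch below is a minimal reconstruction consistent with how they are used here (boolean-mask IoU and a NaN guard), not necessarily the repo's actual implementation.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import numpy as np


def mask_iou(gold_mask: np.ndarray, pred_mask: np.ndarray) -> float:
    """Intersection-over-union between two same-shaped boolean masks."""
    pred_mask = pred_mask.astype(bool)
    intersection = np.logical_and(gold_mask, pred_mask).sum()
    union = np.logical_or(gold_mask, pred_mask).sum()
    # both masks empty means an empty union: define the IoU as 0 to avoid dividing by zero
    return float(intersection) / float(union) if union > 0 else 0.0


def check_nan(features: np.ndarray, context: str) -> None:
    """Fail fast with a descriptive error if any feature value is NaN."""
    if np.isnan(features).any():
        raise ValueError(f"NaN values detected in {context}")
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -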



grolp/readers/alfred.py [1526:1642]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
            turn_tokens = self._tokenizer.add_special_tokens(task_desc_tokens, high_desc_tokens)

            high_descs_tokens.append(TextField(turn_tokens))

        for i in range(len(low_to_high)):
            # extract the language instruction associated with the current low-level action
            language_instructions.append(high_descs_tokens[low_to_high[i]])

            # mark 1 when this action is the last one of its instruction segment
            # (i.e. the next action belongs to a new instruction, or the episode ends), 0 otherwise
            is_segment_end = i == len(low_to_high) - 1 or low_to_high[i + 1] != low_to_high[i]
            start_instr_labels.append(1.0 if is_segment_end else 0.0)

        start_instr_labels = TensorField(torch.tensor(start_instr_labels, dtype=torch.int32), padding_value=0)
        language_instructions = ListField(language_instructions)

        num_low_actions = len(ex['plan']['low_actions']) + 1  # +1 for additional stop action

        assert num_low_actions == len(actions_low), (
            f"Expected {num_low_actions} low-level actions but got {len(actions_low)}!"
        )
        # list of per-step ArrayFields holding the object features for every rotation view
        object_features = []

        rotation_steps = 360 // self.rotation_angle
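        # e.g. rotation_angle == 90 yields 4 panoramic views per trajectory step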
        visual_attention_mask = []
        interact_object_masks = []
        obj_interact_targets = []

        features_root = os.path.join(root, self.vis_feats_path)

        if not os.path.exists(features_root):
            raise ValueError(f"The visual features path {features_root} does not exist!")

        for i in range(num_low_actions):
            curr_step_features = []
            # always include an extra slot for the ResNet feature associated with the front view
            curr_visual_attention_mask = []
            curr_objects_masks = []

            for j in range(rotation_steps):
                feature_path = os.path.join(features_root, f"{i}-{j}.npz")

                if os.path.exists(feature_path):
                    with np.load(feature_path) as f_features:
                        features, attn_mask, masks = self.extract_visual_features(f_features)
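                    # expected shapes, inferred from the zero-padding fallback in the else branch below:
                    #   features:  (max_objects_per_frame, VISUAL_EMB_SIZE) float32
                    #   attn_mask: (max_objects_per_frame,) with 1 for real detections, 0 for padding
                    #   masks:     (max_objects_per_frame, frame_size, frame_size) uint8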

                    check_nan(features,
                              f"MaskRCNN features -- ID: {metadata['task_id']} -- "
                              f"Type: {metadata['task_type']} -- Rotation step: {j}")

                    curr_step_features.append(features)
                    curr_objects_masks.append(masks)

                    curr_visual_attention_mask.extend(attn_mask)
                else:
                    # handles the (rare) cases where no features were extracted for the current trajectory step
                    curr_step_features.append(np.zeros((self.max_objects_per_frame, VISUAL_EMB_SIZE), dtype=np.float32))
                    curr_objects_masks.append(
                        np.zeros((self.max_objects_per_frame, self.frame_size, self.frame_size), dtype=np.uint8))
                    curr_visual_attention_mask.extend([0] * self.max_objects_per_frame)

            curr_step_features = ArrayField(np.concatenate(curr_step_features, 0))
            curr_visual_attention_mask = ArrayField(np.array(curr_visual_attention_mask, dtype=np.uint8))
            object_features.append(curr_step_features)
            visual_attention_mask.append(curr_visual_attention_mask)
            interact_object_masks.append(curr_objects_masks)

            # derive gold targets for mask interaction
            if actions_low[i]["mask"] is not None:
                # extract the gold interaction mask for the current trajectory step
                # (np.bool was removed in NumPy 1.24; use the builtin bool instead)
                gold_mask = actions_low[i]["mask"].astype(bool)

                # pick the front-view predicted mask with the highest IoU against the gold mask
                front_view_masks = curr_objects_masks[0]
                iou_scores = [mask_iou(gold_mask, mask) for mask in front_view_masks]
                label = int(np.argmax(iou_scores))
            else:
                # no interaction at this step: -100 is the ignore index used when building the targets below
                label = -100

            obj_interact_targets.append(label)

        scene_objects_features = ListField(object_features)
        visual_attention_mask = ListField(visual_attention_mask)

        episode_mask = ArrayField(np.ones((num_low_actions,), dtype=np.uint8))

        actions_low_field = ListField([LabelField(a['action'], "low_action_labels") for a in actions_low])

        # metadata["interactive_object_masks"] = interact_object_masks

        obj_interact_targets = torch.tensor(obj_interact_targets, dtype=torch.int64)
        obj_interact_mask = (obj_interact_targets != -100)
        obj_interact_targets = TensorField(obj_interact_targets, padding_value=-100)
        obj_interact_mask = TensorField(obj_interact_mask, dtype=torch.bool)

        metadata_field = MetadataField(metadata)

        instance = Instance(dict(
            metadata=metadata_field,
            instructions=language_instructions,
            start_instr_labels=start_instr_labels,
            actions=actions_low_field,
            visual_features=scene_objects_features,
            visual_attention_mask=visual_attention_mask,
            actions_mask=episode_mask,
            obj_interact_targets=obj_interact_targets,
            obj_interact_mask=obj_interact_mask
        ))

        # cache the built instance so later lookups can skip the preprocessing above
        with self.lmdb_env.begin(write=True, buffers=True) as txn:
            txn.put(instance_cache_key, pickle.dumps(instance))

        return instance
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
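
Note: both snippets end by caching the built Instance in LMDB under instance_cache_key. The read side is not shown; the sketch below is one plausible cache-lookup counterpart using the same pickle round-trip. The method name load_cached_instance and the miss handling are assumptions.

- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import pickle


def load_cached_instance(self, instance_cache_key: bytes):
    """Return the cached Instance for instance_cache_key, or None on a miss."""
    # hypothetical read-side counterpart of the txn.put(...) call above
    with self.lmdb_env.begin(buffers=True) as txn:
        buf = txn.get(instance_cache_key)

        if buf is None:
            return None

        # with buffers=True, txn.get returns a buffer that is only valid inside
        # the transaction, so copy it to bytes before unpickling
        return pickle.loads(bytes(buf))
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -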



