src/model/guesser_vilbert.py [68:187]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        logits = self.state_proj(fuesd_feats).squeeze(-1)

        if bboxs_mask is not None:
            # Mask out padded/invalid boxes so they receive ~zero probability.
            logits[~bboxs_mask] = -1e10
        stat = self.softmax(logits)
        # Blend the previous state with this turn's per-box distribution.
        stat = (1 - self.state_alpha) * curr_state + self.state_alpha * stat
        return stat, logits

    def forward_turn(
        self, 
        q, 
        ans, 
        cats, 
        img_feats,
        bboxs, 
        curr_state=None,
        bboxs_mask=None,
        token_type_ids=None,
        attention_mask=None,
        image_attention_mask=None,
        co_attention_mask=None,
        decode_mask=None,
        task_ids=None,
        output_all_encoded_layers=False,
        output_all_attention_masks=False,
        update_vilbert=True,
        ):
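        # When update_vilbert is False, run the ViLBERT backbone under
        # torch.no_grad() so the encoder weights receive no gradient.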
        if update_vilbert:
            seq_out_txt, seq_out_vis, pooled_out_txt, pooled_out_vis, _ = self.bert(
                q,
                img_feats,
                bboxs,
                token_type_ids,
                attention_mask,
                image_attention_mask,
                co_attention_mask,
                task_ids,
                output_all_encoded_layers=output_all_encoded_layers,
                output_all_attention_masks=output_all_attention_masks,
            )
        else:
            with torch.no_grad():
                # Outputs: per-token text features, per-object visual features,
                # pooled output of the CLS token, and pooled global visual output.
                seq_out_txt, seq_out_vis, pooled_out_txt, pooled_out_vis, _ = self.bert(
                    q,
                    img_feats,
                    bboxs,
                    token_type_ids,
                    attention_mask,
                    image_attention_mask,
                    co_attention_mask,
                    task_ids,
                    output_all_encoded_layers=output_all_encoded_layers,
                    output_all_attention_masks=output_all_attention_masks,
                )
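        # Embed the answer (and, optionally, the object categories) before
        # updating the object-probability state.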
        ans = self.ans_embed(ans)
        cats = self.cat_embed(cats) if self.use_category else None
        if curr_state is None:
            curr_state = self.init_state(
                img_feats.size(0), img_feats.size(1), img_feats.device)
        stat, logits = self.compute_next_state(
            curr_state, seq_out_vis, pooled_out_txt, ans, cats, bboxs_mask)
        return stat, logits

    def forward_session(
        self, 
        qs, 
        ans, 
        end_turn,
        cats, 
        img_feats,
        bboxs, 
        bboxs_mask=None,
        token_type_ids=None,
        attention_mask=None,
        image_attention_mask=None,
        co_attention_mask=None,
        decode_mask=None,
        task_ids=None,
        output_all_encoded_layers=False,
        output_all_attention_masks=False,
        update_vilbert=True,
        return_state_history=False,
        ):

        stat = self.init_state(
            img_feats.size(0), img_feats.size(1), img_feats.device)
        batch_size = qs.size(0)
        max_turns = qs.size(1)
        
        stat_his = []
        final_logits = torch.zeros_like(stat)
        for t in range(max_turns):
            q_t = qs[:, t]
            ans_t = ans[:, t]
            next_stat, logits = self.forward_turn(
                q_t, ans_t, cats, img_feats, bboxs, 
                curr_state=stat,
                bboxs_mask=bboxs_mask,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask[:, t],
                image_attention_mask=image_attention_mask,
                co_attention_mask=co_attention_mask,
                decode_mask=decode_mask,
                task_ids=task_ids,
                output_all_encoded_layers=output_all_encoded_layers,
                output_all_attention_masks=output_all_attention_masks,
                update_vilbert=update_vilbert,
                )
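            # If this is a dialogue's final turn, keep its logits as the output.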
            end = end_turn == t
            final_logits[end] = logits[end]
            stat = next_stat
            stat_his.append(stat)
        
        stat_his = torch.stack(stat_his).transpose(0, 1)
        # Index 0 corresponds to the global image feature, not a candidate
        # object, so it is dropped from the returned logits.
        if return_state_history:
            return final_logits[:, 1:], stat_his
        else:
            return final_logits[:, 1:]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
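
A usage sketch for forward_session, following the shapes implied above: one tokenized question and one answer per turn, the index of each dialogue's final turn, object categories, region features, and boxes, with the global image feature in slot 0. The dimensions, vocabulary sizes, and the GuesserViLBERT constructor are assumptions for illustration only; the real values come from the data loader and model config, not from this file.

Usage sketch (hypothetical, not part of the repo):
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import torch

# Hypothetical dimensions: 2 dialogues, 5 turns, 20 tokens per question,
# 36 candidate boxes plus a global image feature, 2048-d region features.
B, T, L, N, D = 2, 5, 20, 36, 2048

qs = torch.randint(0, 30000, (B, T, L))          # tokenized question per turn
ans = torch.randint(0, 3, (B, T))                # answer id per turn (e.g. yes/no/na)
end_turn = torch.tensor([2, 4])                  # index of each dialogue's last turn
cats = torch.randint(0, 90, (B, N + 1))          # object categories, slot 0 = global
img_feats = torch.randn(B, N + 1, D)             # region features, global feature first
bboxs = torch.randn(B, N + 1, 5)                 # ViLBERT-style box location features
bboxs_mask = torch.ones(B, N + 1, dtype=torch.bool)
attention_mask = torch.ones(B, T, L, dtype=torch.long)

# guesser = GuesserViLBERT(config)               # hypothetical constructor
# logits = guesser.forward_session(
#     qs, ans, end_turn, cats, img_feats, bboxs,
#     bboxs_mask=bboxs_mask, attention_mask=attention_mask,
# )                                              # -> (B, N) logits over candidate objects
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -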



src/model/qgen_vilbert.py [340:458]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        logits = self.state_proj(fuesd_feats).squeeze(-1)

        if bboxs_mask is not None:
            # Mask out padded/invalid boxes so they receive ~zero probability.
            logits[~bboxs_mask] = -1e10
        stat = self.softmax(logits)
        # Blend the previous state with this turn's per-box distribution.
        stat = (1 - self.state_alpha) * curr_state + self.state_alpha * stat
        return stat, logits

    def forward_turn(
        self, 
        q, 
        ans, 
        cats, 
        img_feats,
        bboxs, 
        curr_state=None,
        bboxs_mask=None,
        token_type_ids=None,
        attention_mask=None,
        image_attention_mask=None,
        co_attention_mask=None,
        decode_mask=None,
        task_ids=None,
        output_all_encoded_layers=False,
        output_all_attention_masks=False,
        update_vilbert=True,
        ):
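        # ViLBERT outputs: per-token text features, per-object visual features,
        # pooled output of the CLS token, and pooled global visual output. When
        # update_vilbert is False the backbone runs under torch.no_grad().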
        if update_vilbert:
            seq_out_txt, seq_out_vis, pooled_out_txt, pooled_out_vis, _ = self.bert(
                q,
                img_feats,
                bboxs,
                token_type_ids,
                attention_mask,
                image_attention_mask,
                co_attention_mask,
                task_ids,
                output_all_encoded_layers=output_all_encoded_layers,
                output_all_attention_masks=output_all_attention_masks,
            )
        else:
            with torch.no_grad():
                seq_out_txt, seq_out_vis, pooled_out_txt, pooled_out_vis, _ = self.bert(
                    q,
                    img_feats,
                    bboxs,
                    token_type_ids,
                    attention_mask,
                    image_attention_mask,
                    co_attention_mask,
                    task_ids,
                    output_all_encoded_layers=output_all_encoded_layers,
                    output_all_attention_masks=output_all_attention_masks,
                )
        ans = self.ans_embed(ans)
        cats = self.cat_embed(cats) if self.use_category else None
        if curr_state is None:
            curr_state = self.init_state(
                img_feats.size(0), img_feats.size(1), img_feats.device)
        stat, logits = self.compute_next_state(
            curr_state, seq_out_vis, pooled_out_txt, ans, cats, bboxs_mask)
        return stat, logits

    def forward_session(
        self, 
        qs, 
        ans, 
        end_turn,
        cats, 
        img_feats,
        bboxs, 
        bboxs_mask=None,
        token_type_ids=None,
        attention_mask=None,
        image_attention_mask=None,
        co_attention_mask=None,
        decode_mask=None,
        task_ids=None,
        output_all_encoded_layers=False,
        output_all_attention_masks=False,
        update_vilbert=True,
        return_state_history=False,
        ):

        stat = self.init_state(
            img_feats.size(0), img_feats.size(1), img_feats.device)
        batch_size = qs.size(0)
        max_turns = qs.size(1)
        
        stat_his = []
        final_logits = torch.zeros_like(stat)
        for t in range(max_turns):
            q_t = qs[:, t]
            ans_t = ans[:, t]
            next_stat, logits = self.forward_turn(
                q_t, ans_t, cats, img_feats, bboxs, 
                curr_state=stat,
                bboxs_mask=bboxs_mask,
                token_type_ids=token_type_ids,
                attention_mask=attention_mask[:, t],
                image_attention_mask=image_attention_mask,
                co_attention_mask=co_attention_mask,
                decode_mask=decode_mask,
                task_ids=task_ids,
                output_all_encoded_layers=output_all_encoded_layers,
                output_all_attention_masks=output_all_attention_masks,
                update_vilbert=update_vilbert,
                )
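            # If this is a dialogue's final turn, keep its logits as the output.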
            end = end_turn == t
            final_logits[end] = logits[end]
            stat = next_stat
            stat_his.append(stat)
        
        stat_his = torch.stack(stat_his).transpose(0, 1)
        # Index 0 corresponds to the global image feature, not a candidate
        # object, so it is dropped from the returned logits.
        if return_state_history:
            return final_logits[:, 1:], stat_his
        else:
            return final_logits[:, 1:]
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
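
Both excerpts share the same recurrent state update: the turn's logits are masked, passed through a softmax, and blended with the previous state by state_alpha. The toy trace below uses assumed values (state_alpha = 0.5, four boxes, the last one padded out); the real alpha, box count, and initial state come from the model configuration.

State-update sketch (toy values, not part of the repo):
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
import torch

# Assumed values for illustration: state_alpha = 0.5, four candidate boxes,
# the last box padded out by bboxs_mask.
state_alpha = 0.5
curr_state = torch.full((1, 4), 0.25)             # previous belief over the boxes
logits = torch.tensor([[2.0, 0.0, -1.0, 0.0]])    # this turn's per-box scores
bboxs_mask = torch.tensor([[True, True, True, False]])

logits[~bboxs_mask] = -1e10                       # padded box -> ~zero probability
turn_probs = torch.softmax(logits, dim=-1)
next_state = (1 - state_alpha) * curr_state + state_alpha * turn_probs

print(next_state)        # the padded box keeps only (1 - alpha) of its old mass
print(next_state.sum())  # still sums to 1: both terms are normalized distributions
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -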



