# create_reference_files()
#
# Extracted from codegen_sources/model/src/evaluation/evaluator.py

    def create_reference_files(self):
        """
        Create reference files for BLEU evaluation.

        For every parallel dataset key in ``self.data["para"]`` — either
        ``(lang1, lang2)`` or ``(lang1, lang2, span)`` — and for each split in
        ``EVAL_DATASET_SPLITS``, this method:

        * creates the evaluation-script folders for both translation
          directions and records them in ``params.eval_scripts_folders``;
        * converts the reference sentences of both sides to text, writes them
          to ``ref.<src>-<tgt>.<split>.txt`` files under ``params.hyp_path``,
          and records the paths in ``params.ref_paths``;
        * writes span files and sentence-id files when available, recording
          id paths in ``params.id_paths``;
        * restores the original (pre-BPE) segmentation of the written files.
        """
        params = self.params
        # Dict values are never used here; iterate over the keys only.
        for key in self.data["para"]:
            span = None
            if len(key) == 3:
                lang1, lang2, span = key
            else:
                assert len(key) == 2
                lang1, lang2 = key
            # Language pairs are stored in lexicographic order.
            assert lang1 < lang2, (lang1, lang2)

            for data_set in EVAL_DATASET_SPLITS:
                has_sent_ids = (data_set, (lang1, lang2)) in params.has_sentence_ids

                # Create the evaluation-script folder for each direction.
                # NOTE: os.makedirs replaces the original
                # `subprocess.Popen("mkdir -p %s" % path, shell=True)`, which
                # invoked a shell unnecessarily and broke on paths containing
                # spaces or shell metacharacters.
                for src, tgt in ((lang1, lang2), (lang2, lang1)):
                    folder = os.path.join(
                        params.eval_scripts_root,
                        "{0}-{1}.{2}".format(src, tgt, data_set),
                    )
                    params.eval_scripts_folders[(src, tgt, data_set)] = folder
                    os.makedirs(folder, exist_ok=True)

                # define data paths
                lang1_path = os.path.join(
                    params.hyp_path,
                    "ref.{0}-{1}.{2}.txt".format(lang2, lang1, data_set),
                )
                lang2_path = os.path.join(
                    params.hyp_path,
                    "ref.{0}-{1}.{2}.txt".format(lang1, lang2, data_set),
                )
                # Only written below when span batches were actually seen, so
                # the "None" span name never reaches disk for 2-tuple keys.
                spans_path = os.path.join(
                    params.hyp_path,
                    "ref.{0}-{1}-{3}.{2}.txt".format(lang1, lang2, span, data_set),
                )
                id_path = os.path.join(
                    params.hyp_path,
                    "ids.{0}-{1}.{2}.txt".format(lang1, lang2, data_set),
                )
                # store data paths (reference for direction X->Y is the Y side)
                params.ref_paths[(lang2, lang1, data_set)] = lang1_path
                params.ref_paths[(lang1, lang2, data_set)] = lang2_path
                params.id_paths[(lang1, lang2, data_set)] = id_path
                params.id_paths[(lang2, lang1, data_set)] = id_path

                # text sentences
                lang1_txt = []
                lang2_txt = []

                id_txt = []
                spans = []
                # convert batches of token ids to text
                for i, batch in enumerate(
                    self.get_iterator(data_set, lang1, lang2, span=span)
                ):
                    # Batches are 2-tuples, or 3-tuples when spans are present.
                    if len(batch) == 2:
                        (sent1, len1, id1, lenid1), (sent2, len2, id2, lenid2) = batch
                    else:
                        (
                            (sent1, len1, id1, lenid1),
                            (sent2, len2, id2, lenid2),
                            (span_batch, len_span, _, _),
                        ) = batch
                        spans.extend(list(span_batch.T))
                    lang1_txt.extend(convert_to_text(sent1, len1, self.dico, params))
                    lang2_txt.extend(convert_to_text(sent2, len2, self.dico, params))
                    if has_sent_ids:
                        # Both sides must carry identical sentence ids.
                        assert id1.equal(id2) and lenid1.equal(lenid2)
                        id_txt.extend(convert_to_text(id1, lenid1, self.dico, params))

                # replace <unk> by <<unk>> as these tokens cannot be counted in BLEU
                lang1_txt = [x.replace("<unk>", "<<unk>>") for x in lang1_txt]
                lang2_txt = [x.replace("<unk>", "<<unk>>") for x in lang2_txt]

                # export references
                with open(lang1_path, "w", encoding="utf-8") as f:
                    f.write("\n".join(lang1_txt) + "\n")
                with open(lang2_path, "w", encoding="utf-8") as f:
                    f.write("\n".join(lang2_txt) + "\n")
                if len(spans) > 0:
                    with open(spans_path, "w", encoding="utf-8") as f:
                        f.write("\n".join([str(s) for s in spans]) + "\n")

                # restore original segmentation (undo BPE/roberta tokenization)
                restore_segmentation(
                    lang1_path, roberta_mode=params.roberta_mode, single_line=True
                )
                restore_segmentation(
                    lang2_path, roberta_mode=params.roberta_mode, single_line=True
                )

                if has_sent_ids:
                    with open(id_path, "w", encoding="utf-8") as f:
                        f.write("\n".join(id_txt) + "\n")
                    restore_segmentation(
                        id_path, roberta_mode=params.roberta_mode, single_line=True
                    )