def gen_data_multi_document()

in src/tab_title_tuning_data.py [0:0]


    def gen_data_multi_document(self, dataset: pd.DataFrame, limit=0, low_counts_fraction=LOW_COUNTS_FRACTION):
        """Assemble training rows for clusters of one, two, and the full number of articles.

        The unique ``test_set_id`` values of ``dataset`` are shuffled and cut
        into three consecutive buckets. The first two buckets each receive
        ``int(len(ids) * low_counts_fraction)`` ids and are processed with 1
        and 2 articles respectively; the remaining ids are processed with
        ``self.num_representative_docs`` articles.

        Args:
            dataset: Frame that must contain a ``test_set_id`` column; it is
                passed through unchanged to ``compute_training_data_for_tests``.
            limit: When greater than 0, only the first ``limit`` unique ids
                (in their order of appearance, before shuffling) are used.
            low_counts_fraction: Fraction of the ids assigned to each of the
                one-article and two-article buckets.

        Returns:
            A ``pd.DataFrame`` built from the concatenated results of the three
            ``compute_training_data_for_tests`` calls (presumably one record
            per training example -- confirm against that helper).
        """
        shuffled_ids = dataset["test_set_id"].unique().tolist()
        if limit > 0:
            # The cap is applied before shuffling, so it always keeps the
            # leading ids rather than a random sample.
            keep = min(len(shuffled_ids), limit)
            shuffled_ids = shuffled_ids[:keep]

        random.shuffle(shuffled_ids)
        bucket_size = int(len(shuffled_ids) * low_counts_fraction)

        # Peel the two low-count buckets off the front, one after the other;
        # whatever is left over forms the full-size bucket.
        single_ids, leftover_ids = shuffled_ids[:bucket_size], shuffled_ids[bucket_size:]
        pair_ids, full_ids = leftover_ids[:bucket_size], leftover_ids[bucket_size:]

        for bucket in (single_ids, pair_ids, full_ids):
            print(bucket)

        rows = []
        low_count_plan = (
            ("*** Setting up 1 articles in cluster ", single_ids, 1),
            ("*** Setting up 2 articles in cluster ", pair_ids, 2),
        )
        for banner, bucket_ids, article_count in low_count_plan:
            print(banner)
            rows += self.compute_training_data_for_tests(dataset, bucket_ids, article_count)

        # Read the representative-doc count only now, after the low-count
        # buckets have been processed.
        full_count = self.num_representative_docs
        print(f"*** {full_count} articles in cluster ")
        rows += self.compute_training_data_for_tests(dataset, full_ids, full_count)

        return pd.DataFrame(rows)