in src/tab_title_tuning_data.py [0:0]
def gen_data_multi_document(self, dataset: pd.DataFrame, limit=0, low_counts_fraction=LOW_COUNTS_FRACTION):
    """Assemble training rows for three groups of test sets.

    The unique values of ``dataset["test_set_id"]`` are collected, optionally
    capped, shuffled in place with ``random.shuffle`` and then carved into
    three consecutive groups. Each group is handed to
    ``self.compute_training_data_for_tests`` with a different third argument
    (1, 2 and ``self.num_representative_docs``); judging by the log messages
    this is presumably the number of articles per cluster -- confirm against
    that helper.

    Args:
        dataset: Frame that must contain a ``test_set_id`` column. It is
            passed through unchanged to ``compute_training_data_for_tests``.
        limit: When greater than zero, only the first ``limit`` unique ids
            are kept. The cap is applied *before* shuffling.
        low_counts_fraction: Fraction of the (capped) ids assigned to the
            first group, and again to the second group; whatever is left
            forms the third group.

    Returns:
        A ``pd.DataFrame`` built from the concatenated results of the three
        ``compute_training_data_for_tests`` calls, in group order.
    """
    remaining_ids = dataset["test_set_id"].unique().tolist()
    if limit > 0:
        # Cap first, shuffle afterwards: the kept ids are always the leading ones.
        keep = min(len(remaining_ids), limit)
        remaining_ids = remaining_ids[:keep]
    random.shuffle(remaining_ids)

    # Both low-count groups use the same size, derived from the id count
    # before any group has been removed.
    group_size = int(len(remaining_ids) * low_counts_fraction)

    # Peel the groups off the front one after another so each slice is taken
    # relative to what is still left.
    single_ids = remaining_ids[:group_size]
    remaining_ids = remaining_ids[group_size:]
    pair_ids = remaining_ids[:group_size]
    remaining_ids = remaining_ids[group_size:]

    for id_group in (single_ids, pair_ids, remaining_ids):
        print(id_group)

    rows = []

    print("*** Setting up 1 articles in cluster ")
    rows += self.compute_training_data_for_tests(dataset, single_ids, 1)

    print("*** Setting up 2 articles in cluster ")
    rows += self.compute_training_data_for_tests(dataset, pair_ids, 2)

    print(f"*** {self.num_representative_docs} articles in cluster ")
    rows += self.compute_training_data_for_tests(
        dataset, remaining_ids, self.num_representative_docs
    )

    return pd.DataFrame(rows)