in preprocess/fewshot_gym_dataset.py [0:0]
def generate_k_shot_data(self, k, seed, path=None):
    """
    Build a k-shot train/dev split plus the full test split for this task.

    The training lines are shuffled with NumPy's global RNG seeded by
    ``seed``; the first ``k`` shuffled lines form the train split and the
    next ``k`` form the dev split.  The test split is returned unsampled.

    Note that the ``k`` argument is overridden by the module-level flags:
    it becomes 16384 when ``do_train`` is set and 16 when ``do_test`` is set.

    Args:
        k: requested number of shots (see the override note above).
        seed: random seed used for shuffling the training lines.
        path: forwarded unchanged to ``self.save``.

    Returns:
        ``(k_shot_train, k_shot_dev, k_shot_test)``, or
        ``(None, None, None)`` when the task is skipped.
    """
    task_name = self.hf_identifier
    skipped = (None, None, None)

    # Skip tasks that have no entry in the config, or (in instruct mode)
    # no entry in the prompt-name table.
    if task_name not in config_dict:
        return skipped
    if use_instruct and task_name not in prompt_names_per_task:
        return skipped

    # Training mode only accepts seeds >= 100 and uses a fixed large k;
    # test mode uses a fixed k of 16.
    if do_train and seed < 100:
        return skipped
    if do_train:
        k = 16384
    elif do_test:
        k = 16

    # Load the raw data and flatten it into train/test lines.
    raw_dataset = self.load_dataset()
    train_lines, test_lines = self.get_train_test_lines(raw_dataset)

    # Seeding the global NumPy RNG makes the in-place shuffle reproducible.
    np.random.seed(seed)
    np.random.shuffle(train_lines)

    # Group the training lines per label; every line is filed under the
    # single label "all" here, so there is at most one bucket.
    lines_by_label = {}
    for example in train_lines:
        lines_by_label.setdefault("all", []).append(example)

    # First k lines of each bucket go to train, the following k to dev.
    k_shot_train = []
    k_shot_dev = []
    for bucket in lines_by_label.values():
        k_shot_train.extend(bucket[:k])
        k_shot_dev.extend(bucket[k:2 * k])

    # The test split is used as-is.
    k_shot_test = test_lines

    # Hand the splits to self.save, then return them to the caller.
    self.save(path, k, seed, k_shot_train, k_shot_dev, k_shot_test)
    return k_shot_train, k_shot_dev, k_shot_test