in fastchat/data/split_long_conversation.py [0:0]
def split_one_sample(sample):
    """Split one conversation sample into chunks that fit within ``max_length``.

    Conversations are treated as (human, assistant) round-trip pairs; a chunk
    boundary is inserted whenever adding the next pair would exceed
    ``max_length`` tokens. Relies on module globals ``tokenizer``,
    ``max_length`` and ``make_sample``.

    Args:
        sample: dict with a "conversations" list of {"value": str, ...} turns.

    Returns:
        A list of new samples (as built by ``make_sample``), or [] when the
        sample has fewer than one complete pair.
    """
    conversations = sample["conversations"]
    # Drop a trailing unpaired turn so turns always come in complete pairs.
    conversations = conversations[: len(conversations) // 2 * 2]
    # Check before tokenizing: no point tokenizing a sample we will discard.
    if len(conversations) < 2:
        return []

    # +6 accounts for the per-turn template overhead added at training time.
    # NOTE(review): assumes ``tokenizer`` is a HF-style tokenizer whose call
    # returns an object with ``input_ids`` — confirm against module setup.
    tokenized_lens = [
        len(tokenizer(c["value"]).input_ids) + 6 for c in conversations
    ]

    new_samples = []
    start_idx = 0
    cur_len = 0
    for i in range(0, len(conversations), 2):
        pair_len = tokenized_lens[i] + tokenized_lens[i + 1]
        # Flush the accumulated chunk before this pair would overflow it.
        # The ``i > start_idx`` guard prevents emitting an empty chunk when a
        # single pair alone exceeds max_length (it then forms its own chunk,
        # flushed on the next iteration or by the tail emit below).
        if cur_len + pair_len > max_length and i > start_idx:
            new_samples.append(make_sample(sample, start_idx, i))
            start_idx = i
            cur_len = 0
        cur_len += pair_len

    # Always emit the trailing chunk. The original ``elif`` skipped this when
    # a flush happened on the final iteration, silently dropping the last pair.
    new_samples.append(make_sample(sample, start_idx, len(conversations)))
    return new_samples