in src/models/struxgpt_base.py [0:0]
def chunk_content(self, content: Union[List[str], str],
                  max_length: int = 1024, prefix: str = 'chunk',
                  force_chunk: bool = False) -> List[Dict[str, str]]:
    """Split text into token-bounded chunks, returning a list of
    ``{'idx': '<prefix>_<n>', 'data': <paragraphs joined by '\\n'>}`` dicts.

    If ``content`` is a list, each non-empty element is chunked recursively
    with its list index appended to ``prefix`` (``'<prefix>_<di>'``), and the
    results are concatenated.

    For a single string, paragraphs (``str.splitlines``) are accumulated into
    the current chunk until adding the next paragraph would exceed
    ``max_length`` tokens (token counts come from ``self.count_token``; the
    ``+ 1`` per paragraph presumably budgets for the joining newline — TODO
    confirm). Behavior on overflow depends on ``force_chunk``:

    * ``force_chunk=False``: soft limit. A flush happens only when the new
      paragraph is small (< 64 tokens) or the current chunk is already over
      ``max_length``; otherwise a large paragraph is allowed to push a chunk
      past the limit rather than be split. Chunks may therefore exceed
      ``max_length``.
    * ``force_chunk=True``: hard(er) limit. A paragraph that itself exceeds
      ``max_length`` is broken into sentences via ``self.split_to_sentence``
      and packed greedily (a single sentence is assumed to fit within
      ``max_length``). On a normal overflow, if the last line of the current
      chunk looks like a section title (no trailing punctuation per
      ``PUNCTUATION`` and < 32 tokens), it is moved into the next chunk so a
      heading stays with its body.

    Empty paragraphs / list elements (per ``is_empty``) are skipped.
    Chunk indices are numbered by position in the returned list.
    """
    chunk_list_total = []
    if isinstance(content, list):
        for di, _content in enumerate(content):
            if is_empty(_content):
                continue
            # Recurse per list element; the element index keeps chunk ids
            # unique across elements (e.g. 'chunk_0_3').
            chunk_list_total.extend(
                self.chunk_content(_content, max_length=max_length,
                                   prefix=f'{prefix}_{di}',
                                   force_chunk=force_chunk)
            )
    else:
        # cur_chunk_list: paragraphs of the chunk being built;
        # cur_tokens: its running token total (incl. +1/newline budget).
        cur_chunk_list, cur_tokens = [], 0
        for para in content.splitlines():
            if is_empty(para):
                continue
            tokens = self.count_token(para) + 1
            if tokens > max_length * 1.0 and force_chunk:  # TODO: force_chunk
                # Oversized paragraph: split into sentences and pack them
                # greedily on top of whatever is already in the chunk.
                # print(f'Warning: long paragraph with {tokens} tokens.')
                sentences = self.split_to_sentence(para)
                tmp_sent_list = []  # sentences accumulated since last flush
                for sent in sentences:
                    sent_token = self.count_token(sent) + 1
                    # assume a sentence would not exceed `max_length`
                    if sent_token + cur_tokens > max_length:
                        # Close out the current chunk (including sentences
                        # gathered so far, rejoined with spaces) and start anew.
                        cur_chunk_list.append(' '.join(tmp_sent_list))
                        chunk_list_total.append({
                            'idx': f'{prefix}_{len(chunk_list_total)}', 'data': '\n'.join(cur_chunk_list)
                        })
                        cur_chunk_list, cur_tokens, tmp_sent_list = [], 0, []
                    tmp_sent_list.append(sent)
                    cur_tokens += sent_token
                if len(tmp_sent_list):
                    # Leftover sentences stay in the open chunk as one paragraph.
                    cur_chunk_list.append(' '.join(tmp_sent_list))
                continue  # avoid redundancy
            if not force_chunk:
                # Soft flush: only when the incoming paragraph is small
                # (< 64 tokens) or the chunk has already overflowed — a big
                # paragraph may stretch the current chunk past max_length.
                if (tokens + cur_tokens > max_length) and (tokens < 64 or cur_tokens > max_length) and len(cur_chunk_list):
                    chunk_list_total.append({
                        'idx': f'{prefix}_{len(chunk_list_total)}', 'data': '\n'.join(cur_chunk_list)
                    })
                    cur_chunk_list, cur_tokens = [], 0
            else:
                if tokens + cur_tokens > max_length:
                    # Overflow implies something is already buffered (a lone
                    # oversized paragraph was handled by the branch above).
                    assert len(cur_chunk_list)
                    # Title heuristic: a short last line without terminal
                    # punctuation is likely a heading — carry it forward so it
                    # stays attached to the paragraph that follows it.
                    title_candidate = cur_chunk_list[-1]
                    last_chunk_is_title = title_candidate[-1] not in PUNCTUATION and \
                        self.count_token(title_candidate) < 32
                    if last_chunk_is_title:
                        cur_chunk_list = cur_chunk_list[:-1]
                    if len(cur_chunk_list):
                        chunk_list_total.append({
                            'idx': f'{prefix}_{len(chunk_list_total)}', 'data': '\n'.join(cur_chunk_list)
                        })
                    cur_chunk_list, cur_tokens = [], 0
                    if last_chunk_is_title:
                        # Re-seed the new chunk with the carried-over title.
                        cur_chunk_list.append(title_candidate)
                        cur_tokens = self.count_token(title_candidate) + 1
            cur_chunk_list.append(para)
            cur_tokens += tokens
        # Flush whatever remains after the last paragraph.
        if len(cur_chunk_list):
            chunk_list_total.append({
                'idx': f'{prefix}_{len(chunk_list_total)}', 'data': '\n'.join(cur_chunk_list)
            })
    return chunk_list_total