in scripts/create_kilt_data_paragraphs.py [0:0]
def run_thread(args):
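    """Split one shard of KILT documents into sentence-aligned text chunks.

    Chunks are built from whole sentences and flushed once adding another
    sentence would reach `chunk_size` tokens. Each chunk records the
    enclosing `Section::::` header, and a short trailing chunk is merged
    into the previous one when both come from the same page and section.
    Relies on `tqdm` and the `create_chunk` helper defined elsewhere in
    this script.
    """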
    documents = args["documents"]
    nlp = args["nlp"]
    id = args["id"]
    rank = args["rank"]
    chunk_size = args["chunk_size"]
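    # only the first thread of the first process shows a progress bar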
    if id == 0 and rank == 0:
        iter_ = tqdm(documents)
    else:
        iter_ = documents
    # chunks accumulated across all documents in this shard
    output = []
    for document in iter_:
        # per-document state: token buffer and current section name
        buffer = []
        section = "Section::::Abstract"
        # loop over the paragraphs, skipping the first one (the title)
        for paragraph_id, paragraph in enumerate(nlp.pipe(document["text"][1:]), 1):
            # a section header: record its name and move on
            if "Section::::" in paragraph.text:
                section = paragraph.text.strip()
                continue
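            # pack whole sentences into the buffer, flushing it first when
            # adding the next sentence would reach chunk_size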
            for sentence in paragraph.sents:
                if buffer and len(buffer) + len(sentence) >= chunk_size:
                    # flush the buffer into a new chunk
                    new_chunk = create_chunk(
                        document, buffer, paragraph_id, paragraph, section
                    )
                    output.append(new_chunk)
                    buffer = []
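                # collect the sentence's non-whitespace tokens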
                for token in sentence:
                    word = token.text.strip()
                    if word:
                        buffer.append(token)
        if buffer:
            # flush whatever remains at the end of the document
            new_chunk = create_chunk(
                document, buffer, paragraph_id, paragraph, section
            )
            # merge into the previous chunk only if both come from the same
            # page and section and the combined length stays under chunk_size
            if (
                output
                and document["wikipedia_id"] == output[-1]["wikipedia_id"]
                and section == output[-1]["section"]
                and len(buffer) + output[-1]["tmp_len"] < chunk_size
            ):
                # shift anchor offsets by the previous text plus the joining space
                for anchor in new_chunk["anchors"]:
                    anchor["start"] += len(output[-1]["text"]) + 1
                    anchor["end"] += len(output[-1]["text"]) + 1
                # append the new chunk's data to the previous one
                output[-1]["text"] += " " + new_chunk["text"]
                output[-1]["anchors"] += new_chunk["anchors"]
                output[-1]["sources"] += new_chunk["sources"]
                output[-1]["tmp_len"] += new_chunk["tmp_len"] + 1
            else:
                output.append(new_chunk)
            buffer = []
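    # tmp_len was only needed for the merge check above; drop it before returning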
    for out in output:
        del out["tmp_len"]
    return output
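
# A minimal sketch of how `run_thread` could be dispatched over worker
# processes. It is illustrative only: `load_documents` is a hypothetical
# loader yielding KILT page dicts, and the pool size, round-robin sharding,
# and `chunk_size` value below are assumptions, not this script's actual CLI.
if __name__ == "__main__":
    from multiprocessing import Pool

    import spacy

    nlp = spacy.load("en_core_web_sm")  # any pipeline with sentence boundaries
    documents = load_documents()  # hypothetical: a list of KILT page dicts
    n_workers = 4
    # round-robin sharding so every worker gets a similar load; the worker
    # with id 0 (and rank 0) is the one that reports progress via tqdm
    jobs = [
        {
            "documents": documents[i::n_workers],
            "nlp": nlp,
            "id": i,
            "rank": 0,
            "chunk_size": 100,
        }
        for i in range(n_workers)
    ]
    with Pool(n_workers) as pool:
        # flatten the per-worker chunk lists into a single list of chunks
        chunks = [chunk for result in pool.map(run_thread, jobs) for chunk in result]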