def run_thread(args)

in scripts/create_kilt_data_paragraphs.py
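
Worker function that splits KILT Wikipedia documents into passage chunks of roughly chunk_size tokens. Each paragraph is sentence-segmented with spaCy; tokens are buffered until the chunk budget is reached, "Section::::" headers are tracked as the current section name, and a short paragraph-final chunk is merged into the previous chunk (with its anchor offsets shifted) when both belong to the same document and section.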


def run_thread(args):
    # unpack the per-worker arguments
    documents = args["documents"]  # shard of documents assigned to this worker
    nlp = args["nlp"]  # spaCy pipeline used for sentence segmentation
    id = args["id"]  # worker id within the rank (note: shadows the builtin id)
    rank = args["rank"]  # process rank
    chunk_size = args["chunk_size"]  # target chunk length in tokens

    # only the first worker of the first rank shows a progress bar
    if id == 0 and rank == 0:
        iter_ = tqdm(documents)
    else:
        iter_ = documents

    # chunks produced by this worker
    output = []

    for document in iter_:

        # per-document state: token buffer and current section name
        buffer = []
        section = "Section::::Abstract"

        # loop over paragraphs, skipping the first one (the title)
        for paragraph_id, paragraph in enumerate(nlp.pipe(document["text"][1:]), 1):

            # a section marker: remember the section name and move on
            if "Section::::" in paragraph.text:
                section = paragraph.text.strip()
                continue

            for sentence in paragraph.sents:
                if buffer and len(buffer) + len(sentence) >= chunk_size:
                    # flush the buffer as a new chunk before it exceeds chunk_size
                    new_chunk = create_chunk(
                        document, buffer, paragraph_id, paragraph, section
                    )
                    output.append(new_chunk)
                    buffer = []

                for token in sentence:
                    # buffer only tokens with non-whitespace text
                    if token.text.strip():
                        buffer.append(token)

            if buffer:
                # flush whatever remains of this paragraph as a chunk
                new_chunk = create_chunk(
                    document, buffer, paragraph_id, paragraph, section
                )

                # merge with the previous chunk only if both come from the same
                # document and section and the combined length stays under chunk_size
                if (
                    output
                    and document["wikipedia_id"] == output[-1]["wikipedia_id"]
                    and section == output[-1]["section"]
                    and len(buffer) + output[-1]["tmp_len"] < chunk_size
                ):

                    # shift anchor offsets past the previous chunk's text and the joining space
                    for anchor in new_chunk["anchors"]:
                        anchor["start"] += len(output[-1]["text"]) + 1
                        anchor["end"] += len(output[-1]["text"]) + 1

                    # append the new chunk's content to the previous one
                    output[-1]["text"] += " " + new_chunk["text"]
                    output[-1]["anchors"] += new_chunk["anchors"]
                    output[-1]["sources"] += new_chunk["sources"]
                    output[-1]["tmp_len"] += new_chunk["tmp_len"] + 1
                else:
                    output.append(new_chunk)
                buffer = []

    # drop the bookkeeping length field before returning
    for out in output:
        del out["tmp_len"]

    return output
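
The helper create_chunk is defined elsewhere in scripts/create_kilt_data_paragraphs.py and is not shown here. The sketch below is a hypothetical reconstruction, inferred only from the fields that run_thread reads (wikipedia_id, section, text, anchors, sources, tmp_len); the real helper presumably computes anchors and sources from the document and may differ.


def create_chunk_sketch(document, buffer, paragraph_id, paragraph, section):
    # Hypothetical stand-in for create_chunk; field names are taken from the
    # accesses in run_thread, everything else is an assumption. The paragraph
    # argument is unused in this sketch.
    text = " ".join(token.text for token in buffer)
    return {
        "wikipedia_id": document["wikipedia_id"],
        "section": section,
        "text": text,
        # anchors are assumed to be dicts with "start"/"end" character
        # offsets into text, since run_thread shifts those two keys
        "anchors": [],
        # provenance entries; the exact schema is an assumption
        "sources": [{"paragraph_id": paragraph_id}],
        # token count used by the merge logic and deleted before returning
        "tmp_len": len(buffer),
    }


The id and rank arguments suggest the script fans run_thread out over several workers. A minimal single-process driver sketch, assuming a standard spaCy model and round-robin sharding (both assumptions, not the script's actual setup):


import spacy


def run_all(documents, num_workers=4, chunk_size=100):
    nlp = spacy.load("en_core_web_sm")  # model choice is an assumption
    output = []
    for worker_id in range(num_workers):
        shard = documents[worker_id::num_workers]
        output += run_thread(
            {
                "documents": shard,
                "nlp": nlp,
                "id": worker_id,
                "rank": 0,
                "chunk_size": chunk_size,
            }
        )
    return output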