def chunks_to_embeddings()

in src/doc_builder/build_embeddings.py [0:0]


def _split_heading(heading_str):
    """Split a markdown-style heading string into ``(level, title)``.

    The level is the length of the run of ``#`` characters at the start of
    the string (ignoring leading whitespace). ``#`` characters that occur
    inside the title itself (e.g. ``"## C# bindings"``) are kept in the
    title and are not counted toward the level.
    """
    stripped = heading_str.lstrip()
    title = stripped.lstrip("#")
    return len(stripped) - len(title), title.strip()


def chunks_to_embeddings(client, chunks, is_python_module) -> List[Embedding]:
    """Embed documentation chunks and wrap each one in an ``Embedding``.

    Args:
        client: Object exposing ``feature_extraction(texts, truncate=True)``
            whose result has a ``.tolist()`` method yielding one vector per
            input text.
        chunks: Re-iterable collection of chunk objects providing
            ``package_name``, ``headings``, ``text``, ``source_page_url`` and
            ``source_page_title``.
        is_python_module: Selects the wording of the embedding prefix
            (``"library"`` when truthy, ``"service"`` otherwise).

    Returns:
        One ``Embedding`` per chunk, in input order. An empty list is
        returned (without calling ``client``) when there are no chunks.

    Raises:
        ValueError: If the client returns a different number of vectors than
            the number of texts submitted.
    """
    kind = "library" if is_python_module else "service"
    texts = [
        f"Documentation of {kind} \"{c.package_name}\" under section: {' > '.join(c.headings)}"
        + "\n\n"
        + c.text
        for c in chunks
    ]

    # Nothing to embed: skip the inference call instead of sending an empty batch.
    if not texts:
        return []

    vectors = client.feature_extraction(texts, truncate=True).tolist()

    # zip() would silently drop chunks on a mismatch, pairing the survivors
    # with possibly unrelated vectors; fail loudly instead.
    if len(vectors) != len(texts):
        raise ValueError(
            f"Expected {len(texts)} embeddings but received {len(vectors)}"
        )

    max_level = 5  # Embedding exposes heading1..heading5 only
    embeddings = []
    for c, embed in zip(chunks, vectors):
        headings = [None] * max_level
        last_heading = None

        for heading_str in c.headings:
            # Only the leading '#' run defines the level; a '#' inside the
            # title (e.g. "C#") must not shift the heading to a deeper slot.
            level, heading_text = _split_heading(heading_str)
            if 1 <= level <= max_level:
                headings[level - 1] = heading_text
                last_heading = heading_text

        # If the page URL carries no anchor yet, point it at the last
        # recognised heading of this chunk.
        source_page_url = c.source_page_url
        if "#" not in source_page_url and last_heading is not None:
            source_page_url += "#" + slugify(last_heading)

        embeddings.append(
            Embedding(
                text=c.text,
                source_page_url=source_page_url,
                source_page_title=c.source_page_title,
                library=c.package_name,
                embedding=embed,
                heading1=headings[0],
                heading2=headings[1],
                heading3=headings[2],
                heading4=headings[3],
                heading5=headings[4],
            )
        )

    return embeddings