in src/doc_builder/build_embeddings.py [0:0]
def _split_heading(heading_str):
    """Split a markdown-style heading such as ``"## Title"`` into ``(level, text)``.

    The level is the number of *leading* ``#`` characters only, so a ``#`` that
    belongs to the title itself (e.g. ``"## C# guide"``) neither inflates the
    level nor gets stripped from the text. A string without a leading ``#``
    yields level ``0``.
    """
    stripped = heading_str.lstrip()
    title = stripped.lstrip("#")
    return len(stripped) - len(title), title.strip()


def chunks_to_embeddings(client, chunks, is_python_module) -> List[Embedding]:
    """Embed documentation chunks and wrap each one in an ``Embedding`` record.

    Args:
        client: Object exposing ``feature_extraction(texts, truncate=True)``
            whose result provides ``.tolist()``; one vector per input text is
            expected, in input order.
        chunks: Re-iterable collection (it is traversed twice) of chunk objects
            providing ``package_name``, ``headings``, ``text``,
            ``source_page_url`` and ``source_page_title``.
        is_python_module: Selects the wording of the context prefix sent to the
            embedding model: "library" when truthy, "service" otherwise.

    Returns:
        One ``Embedding`` per chunk, in the same order as ``chunks``. An empty
        list is returned, without calling ``client``, when there are no chunks.
    """
    # Loop-invariant part of the prompt prefix.
    doc_kind = "library" if is_python_module else "service"
    texts = [
        f"Documentation of {doc_kind} \"{chunk.package_name}\" under section: {' > '.join(chunk.headings)}"
        + "\n\n"
        + chunk.text
        for chunk in chunks
    ]
    if not texts:
        # Nothing to embed; avoid sending an empty batch to the inference client.
        return []

    vectors = client.feature_extraction(texts, truncate=True).tolist()

    embeddings = []
    for chunk, vector in zip(chunks, vectors):
        # Slot i holds the text of the level-(i + 1) heading, or None if absent.
        headings = [None] * 5
        last_heading = None
        for heading_str in chunk.headings:
            level, heading_text = _split_heading(heading_str)
            if 1 <= level <= 5:
                headings[level - 1] = heading_text
                last_heading = heading_text

        # If the page URL has no anchor yet, point it at the last recorded
        # heading so the link lands on the relevant section.
        source_page_url = chunk.source_page_url
        if "#" not in source_page_url and last_heading is not None:
            source_page_url += "#" + slugify(last_heading)

        embeddings.append(
            Embedding(
                text=chunk.text,
                source_page_url=source_page_url,
                source_page_title=chunk.source_page_title,
                library=chunk.package_name,
                embedding=vector,
                heading1=headings[0],
                heading2=headings[1],
                heading3=headings[2],
                heading4=headings[3],
                heading5=headings[4],
            )
        )
    return embeddings