in src/doc_builder/build_embeddings.py [0:0]
def get_chunks(self, page_info, chunk_len_chars, headings=None):
    """Collect the text chunks of this node followed by those of its children.

    The node's content is split with ``self.split_markdown`` and the pieces
    are joined with single spaces into chunks. The size limit is soft: the
    running text is flushed only once it is *already* longer than
    ``chunk_len_chars``, so a chunk may exceed that length.

    Args:
        page_info: Mapping that provides the ``"package_name"`` and
            ``"page"`` entries used to build each chunk's URL, title and
            package name.
        chunk_len_chars: Length (in characters) past which the running
            chunk is flushed before the next piece is added.
        headings: Headings of the ancestor nodes, outermost first. ``None``
            (the default) means there are no ancestors. The list passed in
            is never modified.

    Returns:
        A list of ``Chunk`` objects: this node's chunks first, then the
        chunks of every child in ``self.children`` order. Empty when the
        node's content splits into no pieces.

    NOTE(review): when this node's own content yields no pieces the method
    returns before visiting ``self.children``, so child sections are skipped
    too. Kept as-is; confirm this is intended.
    """
    # `+` builds a new list, so neither the caller's list nor a shared
    # default object is ever mutated.
    headings = ([] if headings is None else headings) + [self.heading]

    split_content = self.split_markdown(self.content)
    if not split_content:
        return []

    def make_chunk(raw_text):
        # Single construction site so every chunk of this node gets the
        # same metadata layout.
        return Chunk(
            text=raw_text.strip(),
            source_page_url=f"https://huggingface.co/docs/{page_info['package_name']}/{page_info['page']}#{self.anchor}",
            source_page_title=get_page_title(page_info["page"]),
            package_name=page_info["package_name"],
            headings=headings,
        )

    chunks = []
    chunk_str = ""
    for content in split_content:
        # Flush *before* appending: the limit is checked against what has
        # been accumulated so far, not against the would-be result.
        if len(chunk_str) > chunk_len_chars:
            chunks.append(make_chunk(chunk_str))
            chunk_str = ""
        chunk_str += content + " "
    if chunk_str:
        # Emit whatever is left after the last piece.
        chunks.append(make_chunk(chunk_str))

    # Children inherit the heading trail that now ends with this node.
    for child in self.children:
        chunks.extend(child.get_chunks(page_info, chunk_len_chars, headings=headings))
    return chunks