def create_chunks()

in src/doc_builder/build_embeddings.py [0:0]


def create_chunks(package, doc_folder, page_info, version_tag_suffix, is_python_module) -> List[Chunk]:
    """
    Build the chunks to embed from the markdown files of a documentation folder.

    Every `.md`/`.mdx` file found (recursively) in `doc_folder` is cleaned and processed, its `[[autodoc]]`
    sections are converted to autodoc chunks, and the remaining content is converted to markdown chunks.

    Args:
        package (`types.ModuleType`): The package where to look for objects to document.
        doc_folder (`str` or `os.PathLike`): The folder where the doc source files are.
        page_info (`Dict[str, str]`):
            Some information about the page. This dictionary is modified in place: `"package_name"` is filled in
            from `package.__name__` when missing, `"path"` is set to each visited file, and `"page"` is set while a
            markdown page is being processed and removed afterwards.
        version_tag_suffix (`str`):
            Suffix to add after the version tag (e.g. 1.3.0 or main) in the documentation links.
            For example, a `"src/"` suffix will result in a base link as `https://github.com/huggingface/{package_name}/blob/{version_tag}/src/`.
            For example, `version_tag_suffix=""` will result in a base link as `https://github.com/huggingface/{package_name}/blob/{version_tag}/`.
        is_python_module (`bool`):
            When truthy, the text of every chunk is passed through `resolve_links_in_text` using the anchors
            collected so far.

    Returns:
        `List[Chunk]`: The chunks of all pages; for each page the markdown chunks come first, followed by the
        autodoc chunks.

    Raises:
        `ChunkingError`: If anything goes wrong while converting a file (the original exception is chained).
        `ValueError`: If `create_autodoc_chunks` reported errors for at least one page.
    """
    doc_folder = Path(doc_folder)
    anchor_mapping = {}

    if "package_name" not in page_info:
        page_info["package_name"] = package.__name__

    chunks = []
    all_errors = []
    all_files = list(doc_folder.glob("**/*"))
    for file in tqdm(all_files, desc="Building the chunks to embed"):
        # "path" is updated for every visited file, markdown or not.
        page_info["path"] = file
        if file.suffix not in (".md", ".mdx"):
            continue

        # POSIX-style page name, used both for `page_info["page"]` and as the target in the anchor mapping so
        # that the two always agree (including on platforms where `str(path)` uses backslashes).
        page_name = file.with_suffix("").relative_to(doc_folder).as_posix()
        autodoc_chunks, new_anchors, errors = [], [], []
        try:
            page_info["page"] = page_name
            with open(file, "r", encoding="utf-8-sig") as reader:
                content = reader.read()
            content = clean_md(content)
            content = process_md(content, page_info)

            if "[[autodoc]]" in content:
                for section in get_autodoc_sections(content):
                    autodoc_content = "\n\n".join(_re_autodoc_all.findall(section["text"]))
                    if not autodoc_content.strip():
                        continue
                    section_chunks, section_anchors, section_errors = create_autodoc_chunks(
                        autodoc_content,
                        package,
                        return_anchors=True,
                        page_info=page_info,
                        version_tag_suffix=version_tag_suffix,
                        headings=section["headings"],
                    )
                    autodoc_chunks.extend(section_chunks)
                    new_anchors.extend(section_anchors)
                    errors.extend(section_errors)

            # The autodoc markers were handled above; strip them before chunking the plain markdown.
            content = _re_autodoc_all.sub("", content)

            markdown_chunks = create_markdown_chunks(
                content,
                page_info=page_info,
            )

            # Make sure we clean up for next page.
            del page_info["page"]

            page_chunks = markdown_chunks + autodoc_chunks

            if is_python_module:
                # NOTE(review): `anchor_mapping` only holds anchors from pages processed before this one (this
                # page's own anchors are registered below), so links to later pages are presumably left
                # unresolved - confirm whether a second resolution pass is wanted.
                page_chunks = [
                    chunk._replace(text=resolve_links_in_text(chunk.text, package, anchor_mapping, page_info))
                    for chunk in page_chunks
                ]

            chunks.extend(page_chunks)

        except Exception as e:
            # `e.args` may be empty or start with a non-string value (e.g. an errno), so build the detail text
            # defensively instead of concatenating `e.args[0]` blindly.
            if e.args and isinstance(e.args[0], str):
                detail = e.args[0]
            else:
                detail = str(e) or repr(e)
            raise ChunkingError(
                f"There was an error when converting {file} to chunks to embed.\n" + detail
            ) from e

        for anchor in new_anchors:
            if isinstance(anchor, tuple):
                # A tuple is a main anchor followed by aliases: aliases point to the main anchor inside the
                # page and never override an entry that is already registered.
                main_anchor = anchor[0]
                for alias in anchor[1:]:
                    if alias not in anchor_mapping:
                        anchor_mapping[alias] = f"{page_name}#{main_anchor}"
                anchor = main_anchor
            anchor_mapping[anchor] = page_name

        all_errors.extend(errors)

    if all_errors:
        raise ValueError(
            "The deployment of the documentation will fail because of the following errors:\n" + "\n".join(all_errors)
        )

    return chunks