in src/doc_builder/build_embeddings.py [0:0]
def create_chunks(package, doc_folder, page_info, version_tag_suffix, is_python_module) -> List[Chunk]:
    """
    Build the chunks to embed from the documentation source files of a given package.

    Every `.md` / `.mdx` file found (recursively) under `doc_folder` is cleaned, processed and split into
    markdown chunks; `[[autodoc]]` sections are turned into autodoc chunks and removed from the markdown content
    before it is chunked.

    Args:
        package (`types.ModuleType`): The package where to look for objects to document.
        doc_folder (`str` or `os.PathLike`): The folder where the doc source files are.
        page_info (`Dict[str, str]`): Some information about the page. This dictionary is modified in place:
            `"package_name"` is filled in from `package.__name__` when missing, `"path"` is set to the file being
            processed, and `"page"` is set while a markdown file is processed and removed afterwards.
        version_tag_suffix (`str`):
            Suffix to add after the version tag (e.g. 1.3.0 or main) in the documentation links. It is forwarded
            to `create_autodoc_chunks`.
            For example, `"src/"` will result in a base link as
            `https://github.com/huggingface/{package_name}/blob/{version_tag}/src/`.
        is_python_module (`bool`):
            When truthy, the text of every chunk is passed through `resolve_links_in_text`.

    Returns:
        `List[Chunk]`: The markdown and autodoc chunks of all the pages, in the order the files were processed.

    Raises:
        ChunkingError: If anything goes wrong while converting a file to chunks.
        ValueError: If the autodoc processing reported errors for one or more pages.
    """
    doc_folder = Path(doc_folder)
    anchor_mapping = {}

    if "package_name" not in page_info:
        page_info["package_name"] = package.__name__

    chunks = []
    all_files = list(doc_folder.glob("**/*"))
    all_errors = []

    for file in tqdm(all_files, desc="Building the chunks to embed"):
        new_anchors = None
        errors = None
        page_info["path"] = file

        try:
            if file.suffix in (".md", ".mdx"):
                page_info["page"] = file.with_suffix("").relative_to(doc_folder).as_posix()
                # "utf-8-sig" transparently drops a leading BOM if the file has one.
                with open(file, "r", encoding="utf-8-sig") as reader:
                    content = reader.read()
                content = clean_md(content)
                content = process_md(content, page_info)

                autodoc_chunks, new_anchors, errors = [], [], []
                if "[[autodoc]]" in content:
                    for section in get_autodoc_sections(content):
                        autodoc_content = "\n\n".join(_re_autodoc_all.findall(section["text"]))
                        if not autodoc_content.strip():
                            continue
                        _autodoc_chunks, _new_anchors, _errors = create_autodoc_chunks(
                            autodoc_content,
                            package,
                            return_anchors=True,
                            page_info=page_info,
                            version_tag_suffix=version_tag_suffix,
                            headings=section["headings"],
                        )
                        autodoc_chunks.extend(_autodoc_chunks)
                        new_anchors.extend(_new_anchors)
                        errors.extend(_errors)
                    # The autodoc markers were handled above; strip them so they are not chunked as markdown.
                    content = _re_autodoc_all.sub("", content)

                markdown_chunks = create_markdown_chunks(
                    content,
                    page_info=page_info,
                )

                # Make sure we clean up for next page.
                del page_info["page"]

                page_chunks = markdown_chunks + autodoc_chunks

                if is_python_module:
                    # NOTE: at this point `anchor_mapping` only holds the anchors collected from the files
                    # processed before this one; the anchors of the current page are registered further down.
                    page_chunks = [
                        chunk._replace(text=resolve_links_in_text(chunk.text, package, anchor_mapping, page_info))
                        for chunk in page_chunks
                    ]

                chunks.extend(page_chunks)

        except Exception as e:
            # `e.args` may be empty or start with a non-string value (e.g. the errno of an `OSError`), in which
            # case concatenating `e.args[0]` would raise a secondary error that hides the real one.
            if e.args and isinstance(e.args[0], str):
                detail = e.args[0]
            else:
                detail = str(e)
            raise ChunkingError(
                f"There was an error when converting {file} to chunks to embed.\n" + detail
            ) from e

        if new_anchors:
            page_name = str(file.with_suffix("").relative_to(doc_folder))
            for anchor in new_anchors:
                if isinstance(anchor, tuple):
                    # The first item is the main anchor; the others are mapped to it unless already registered.
                    anchor_mapping.update(
                        {a: f"{page_name}#{anchor[0]}" for a in anchor[1:] if a not in anchor_mapping}
                    )
                    anchor = anchor[0]
                anchor_mapping[anchor] = page_name

        if errors:
            all_errors.extend(errors)

    if all_errors:
        raise ValueError(
            "The deployment of the documentation will fail because of the following errors:\n" + "\n".join(all_errors)
        )

    return chunks