def parse_documents()

in supporting-blog-content/github-assistant/index.py [0:0]


def parse_documents():
    """Clone the configured GitHub repository and split its files into nodes.

    Configuration is read from environment variables:

    * ``GITHUB_OWNER`` and ``GITHUB_REPO`` -- required.
    * ``GITHUB_BRANCH`` -- forwarded to ``clone_repository`` unchanged
      (``None`` when the variable is unset).
    * ``BASE_PATH`` -- forwarded to ``clone_repository``; defaults to
      ``/tmp``.

    File extensions are grouped, and each group is paired with the parser
    used to split those files. For every group the clone is searched
    recursively; when at least one file matches, the files are loaded with
    ``SimpleDirectoryReader``, split by the group's parser, reported through
    ``print_docs_and_nodes`` and added to the result. A one-line message per
    group (found / not found) is handed to ``collect_and_print_file_summary``
    once all groups have been processed.

    Returns:
        list: The nodes produced by all parsers, in group order.

    Raises:
        ValueError: If ``GITHUB_OWNER`` or ``GITHUB_REPO`` is unset or empty.
    """
    owner = os.getenv("GITHUB_OWNER")
    repo = os.getenv("GITHUB_REPO")
    branch = os.getenv("GITHUB_BRANCH")
    base_path = os.getenv("BASE_PATH", "/tmp")

    # Validate before cloning so a misconfiguration fails fast.
    if not owner or not repo:
        raise ValueError(
            "GITHUB_OWNER and GITHUB_REPO environment variables must be set."
        )

    local_repo_path = clone_repository(owner, repo, branch, base_path)

    nodes = []
    file_summary = []

    ts_parser = get_parser("typescript")
    py_parser = get_parser("python")
    go_parser = get_parser("go")
    js_parser = get_parser("javascript")
    bash_parser = get_parser("bash")
    yaml_parser = get_parser("yaml")

    # Every extension must include its leading dot: the same strings are used
    # both as glob suffixes and as ``required_exts`` for the directory reader.
    parsers_and_extensions = [
        (SentenceSplitter(), [".md"]),
        (CodeSplitter(language="python", parser=py_parser), [".py", ".ipynb"]),
        (CodeSplitter(language="typescript", parser=ts_parser), [".ts"]),
        (CodeSplitter(language="go", parser=go_parser), [".go"]),
        (CodeSplitter(language="javascript", parser=js_parser), [".js"]),
        (CodeSplitter(language="bash", parser=bash_parser), [".bash", ".sh"]),
        (CodeSplitter(language="yaml", parser=yaml_parser), [".yaml", ".yml"]),
        (JSONNodeParser(), [".json"]),
    ]

    for parser, extensions in parsers_and_extensions:
        extension_list = ", ".join(extensions)

        # Count matches up front so the reader is only constructed for groups
        # that actually have files in the clone.
        matching_files = []
        for ext in extensions:
            matching_files.extend(
                glob.glob(f"{local_repo_path}/**/*{ext}", recursive=True)
            )

        if not matching_files:
            file_summary.append(f"No {extension_list} files found in the repository.")
            continue

        file_summary.append(
            f"Found {len(matching_files)} {extension_list} files in the repository."
        )

        loader = SimpleDirectoryReader(
            input_dir=local_repo_path, required_exts=extensions, recursive=True
        )
        docs = loader.load_data()
        parsed_nodes = parser.get_nodes_from_documents(docs)

        print_docs_and_nodes(docs, parsed_nodes)

        nodes.extend(parsed_nodes)

    collect_and_print_file_summary(file_summary)
    print("\n")
    return nodes