in supporting-blog-content/github-assistant/index.py [0:0]
def parse_documents():
    """Clone the configured GitHub repository and split its files into nodes.

    Configuration is read from environment variables:

    * ``GITHUB_OWNER`` and ``GITHUB_REPO`` -- required.
    * ``GITHUB_BRANCH`` -- passed through to ``clone_repository`` as-is
      (``None`` when unset).
    * ``BASE_PATH`` -- passed to ``clone_repository``; defaults to ``/tmp``.

    For every (parser, extensions) group, the cloned tree is searched for
    matching files. When at least one is found, the files are loaded with
    ``SimpleDirectoryReader`` and split with that group's parser. A one-line
    summary is recorded per group and handed to
    ``collect_and_print_file_summary`` at the end.

    Returns:
        list: the nodes produced by all parsers, in group order.

    Raises:
        ValueError: if ``GITHUB_OWNER`` or ``GITHUB_REPO`` is unset or empty.
    """
    owner = os.getenv("GITHUB_OWNER")
    repo = os.getenv("GITHUB_REPO")
    branch = os.getenv("GITHUB_BRANCH")
    base_path = os.getenv("BASE_PATH", "/tmp")

    # Validate before cloning so a misconfiguration fails fast.
    if not owner or not repo:
        raise ValueError(
            "GITHUB_OWNER and GITHUB_REPO environment variables must be set."
        )

    local_repo_path = clone_repository(owner, repo, branch, base_path)

    # Each entry pairs a splitter with the file extensions it handles.
    parsers_and_extensions = [
        (SentenceSplitter(), [".md"]),
        (
            CodeSplitter(language="python", parser=get_parser("python")),
            [".py", ".ipynb"],
        ),
        (
            CodeSplitter(language="typescript", parser=get_parser("typescript")),
            [".ts"],
        ),
        (CodeSplitter(language="go", parser=get_parser("go")), [".go"]),
        (
            CodeSplitter(language="javascript", parser=get_parser("javascript")),
            [".js"],
        ),
        # NOTE: the second extension was previously ",sh" (comma typo), which
        # meant *.sh scripts were never matched or loaded.
        (
            CodeSplitter(language="bash", parser=get_parser("bash")),
            [".bash", ".sh"],
        ),
        (
            CodeSplitter(language="yaml", parser=get_parser("yaml")),
            [".yaml", ".yml"],
        ),
        (JSONNodeParser(), [".json"]),
    ]

    nodes = []
    file_summary = []

    for parser, extensions in parsers_and_extensions:
        extension_list = ", ".join(extensions)

        # Count matches up front so groups with no files skip the loader.
        matching_files = []
        for ext in extensions:
            matching_files.extend(
                glob.glob(f"{local_repo_path}/**/*{ext}", recursive=True)
            )

        if not matching_files:
            file_summary.append(f"No {extension_list} files found in the repository.")
            continue

        file_summary.append(
            f"Found {len(matching_files)} {extension_list} files in the repository."
        )

        loader = SimpleDirectoryReader(
            input_dir=local_repo_path, required_exts=extensions, recursive=True
        )
        docs = loader.load_data()
        parsed_nodes = parser.get_nodes_from_documents(docs)
        print_docs_and_nodes(docs, parsed_nodes)
        nodes.extend(parsed_nodes)

    collect_and_print_file_summary(file_summary)
    print("\n")
    return nodes