# Copyright 2025 DeepMind Technologies Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Processor for extracting file content from GitHub."""
from collections.abc import AsyncIterable
import dataclasses
from urllib.parse import urlparse
import dataclasses_json
from genai_processors import content_api
from genai_processors import processor
import httpx
GITHUB_URL_MIMETYPE = 'application/json; type=GithubUrl'
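

# URL payload carried in parts with GITHUB_URL_MIMETYPE; the text of such a
# part is this dataclass serialized as JSON (see GithubProcessor.call below).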
@dataclasses_json.dataclass_json
@dataclasses.dataclass(frozen=True)
class GithubUrl(dataclasses_json.DataClassJsonMixin):
  url: str


def parse_github_url(url: str) -> dict[str, str] | None:
  """Parses a GitHub file URL to extract owner, repo, ref (branch/tag/commit),
  and file path.

  Example URL: https://github.com/owner/repo/blob/main/path/to/file.txt

  Args:
    url: URL of a file on GitHub.

  Returns:
    dict of URL component to value, if the URL can be parsed; otherwise None.
  """
  url_object = urlparse(url)
  path_parts = [part for part in url_object.path.split('/') if part]
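  # For a blob URL the path is /<owner>/<repo>/blob/<ref>/<path/to/file>, so a
  # parseable URL has at least five non-empty segments with 'blob' third.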
  if (
      url_object.hostname != 'github.com'
      or len(path_parts) < 5
      or path_parts[2] != 'blob'
  ):
    return None
  [owner, repo, _, ref] = path_parts[:4]
  file_path = '/'.join(path_parts[4:])
  return {'owner': owner, 'repo': repo, 'ref': ref, 'path': file_path}


async def fetch_url(api_key: str, parsed_url: dict[str, str]) -> str:
  """Fetches file content from GitHub for a parsed blob URL."""
  # See https://docs.github.com/en/rest/repos/contents#get-repository-content
  api_url = (
      f'https://api.github.com/repos/{parsed_url["owner"]}/{parsed_url["repo"]}'
      f'/contents/{parsed_url["path"]}'
  )
  headers = {
      # Get raw content directly, not JSON with base64 encoded content.
      'Accept': 'application/vnd.github.raw',
      'Authorization': f'Bearer {api_key}',
      'X-GitHub-Api-Version': '2022-11-28',
  }
  params = {'ref': parsed_url['ref']}
  async with httpx.AsyncClient() as client:
    response = await client.get(api_url, headers=headers, params=params)
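    # raise_for_status raises httpx.HTTPStatusError for 4xx/5xx responses,
    # e.g. a missing file or an invalid token.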
    response.raise_for_status()
    return response.text


class GithubProcessor(processor.PartProcessor):
  """Processor for extracting file content from GitHub.

  Example usage:

  ```python
  from genai_processors import content_api
  from genai_processors.core import github
  from genai_processors.core import text

  pipeline = (
      text.UrlExtractor({'https://github.com': github.GithubUrl})
      + github.GithubProcessor(api_key='YOUR_GITHUB_API_KEY')
  )
  url = content_api.Content(
      'https://github.com/owner/repo/blob/main/path/to/file.txt'
      ' Some other text.'
  )
  async for part in pipeline(url):
    # Will print the contents of file.txt, followed by "Some other text".
    print(part.text)
  ```
  """

  def __init__(self, api_key: str):
    self._api_key = api_key

  def match(self, part: content_api.ProcessorPart) -> bool:
    return part.mimetype == GITHUB_URL_MIMETYPE

  async def call(
      self,
      part: content_api.ProcessorPart,
  ) -> AsyncIterable[content_api.ProcessorPart]:
    """Fetches content from a GitHub URL."""
    url = GithubUrl.from_json(part.text)
    parsed_url = parse_github_url(url.url)
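    # URLs that are not parseable GitHub blob URLs are skipped; no part is
    # yielded for them.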
    if not parsed_url:
      return
    content = await fetch_url(self._api_key, parsed_url)
    yield content_api.ProcessorPart(content)
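

# A minimal, self-contained usage sketch (not part of the library API): wires
# the processor into the pipeline described in the GithubProcessor docstring
# and runs it on a single URL. The GITHUB_TOKEN environment variable and the
# target URL below are assumptions for illustration only.
if __name__ == '__main__':
  import asyncio
  import os

  from genai_processors.core import text

  async def _demo() -> None:
    pipeline = (
        text.UrlExtractor({'https://github.com': GithubUrl})
        + GithubProcessor(api_key=os.environ['GITHUB_TOKEN'])
    )
    url = content_api.Content(
        'https://github.com/owner/repo/blob/main/path/to/file.txt'
    )
    async for part in pipeline(url):
      print(part.text)

  asyncio.run(_demo())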