# Copyright 2025 DeepMind Technologies Limited. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
"""Processor for extracting file content from GitHub."""
from collections.abc import AsyncIterable
import dataclasses
from urllib.parse import urlparse
import dataclasses_json
from genai_processors import content_api
from genai_processors import processor
import httpx
GITHUB_URL_MIMETYPE = 'application/json; type=GithubUrl'
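

# URL payload carried in parts with GITHUB_URL_MIMETYPE; the text of such a
# part is this dataclass serialized as JSON (see GithubProcessor.call below).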
@dataclasses_json.dataclass_json
@dataclasses.dataclass(frozen=True)
class GithubUrl(dataclasses_json.DataClassJsonMixin):
  url: str


def parse_github_url(url: str) -> dict[str, str] | None:
  """Parses a GitHub file URL to extract owner, repo, ref (branch/tag/commit),
  and file path.

  Example URL: https://github.com/owner/repo/blob/main/path/to/file.txt

  Args:
    url: URL of a file on GitHub.

  Returns:
    dict of URL component to value, if the URL can be parsed; otherwise None.
  """
  url_object = urlparse(url)
  path_parts = [part for part in url_object.path.split('/') if part]
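  # For a blob URL the path is /<owner>/<repo>/blob/<ref>/<path/to/file>, so a
  # parseable URL has at least five non-empty segments with 'blob' third.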
  if (
      url_object.hostname != 'github.com'
      or len(path_parts) < 5
      or path_parts[2] != 'blob'
  ):
    return None
  [owner, repo, _, ref] = path_parts[:4]
  file_path = '/'.join(path_parts[4:])
  return {'owner': owner, 'repo': repo, 'ref': ref, 'path': file_path}


async def fetch_url(api_key: str, parsed_url: dict[str, str]) -> str:
  """Fetches file content from GitHub for a parsed blob URL."""
  # See https://docs.github.com/en/rest/repos/contents#get-repository-content
  api_url = (
      f'https://api.github.com/repos/{parsed_url["owner"]}/{parsed_url["repo"]}'
      f'/contents/{parsed_url["path"]}'
  )
  headers = {
      # Get raw content directly, not JSON with base64 encoded content.
      'Accept': 'application/vnd.github.raw',
      'Authorization': f'Bearer {api_key}',
      'X-GitHub-Api-Version': '2022-11-28',
  }
  params = {'ref': parsed_url['ref']}
  async with httpx.AsyncClient() as client:
    response = await client.get(api_url, headers=headers, params=params)
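    # raise_for_status raises httpx.HTTPStatusError for 4xx/5xx responses,
    # e.g. a missing file or an invalid token.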
    response.raise_for_status()
    return response.text


class GithubProcessor(processor.PartProcessor):
  """Processor for extracting file content from GitHub.

  Example usage:

  ```python
  from genai_processors import content_api
  from genai_processors.core import github
  from genai_processors.core import text

  pipeline = (
      text.UrlExtractor({'https://github.com': github.GithubUrl})
      + github.GithubProcessor(api_key='YOUR_GITHUB_API_KEY')
  )
  url = content_api.Content(
      'https://github.com/owner/repo/blob/main/path/to/file.txt'
      ' Some other text.'
  )
  async for part in pipeline(url):
    # Will print the contents of file.txt, followed by "Some other text".
    print(part.text)
  ```
  """

  def __init__(self, api_key: str):
    self._api_key = api_key

  def match(self, part: content_api.ProcessorPart) -> bool:
    return part.mimetype == GITHUB_URL_MIMETYPE

  async def call(
      self,
      part: content_api.ProcessorPart,
  ) -> AsyncIterable[content_api.ProcessorPart]:
    """Fetches content from a GitHub URL."""
    url = GithubUrl.from_json(part.text)
    parsed_url = parse_github_url(url.url)
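    # URLs that are not parseable GitHub blob URLs are skipped; no part is
    # yielded for them.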
    if not parsed_url:
      return
    content = await fetch_url(self._api_key, parsed_url)
    yield content_api.ProcessorPart(content)
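

# A minimal, self-contained usage sketch (not part of the library API): wires
# the processor into the pipeline described in the GithubProcessor docstring
# and runs it on a single URL. The GITHUB_TOKEN environment variable and the
# target URL below are assumptions for illustration only.
if __name__ == '__main__':
  import asyncio
  import os

  from genai_processors.core import text

  async def _demo() -> None:
    pipeline = (
        text.UrlExtractor({'https://github.com': GithubUrl})
        + GithubProcessor(api_key=os.environ['GITHUB_TOKEN'])
    )
    url = content_api.Content(
        'https://github.com/owner/repo/blob/main/path/to/file.txt'
    )
    async for part in pipeline(url):
      print(part.text)

  asyncio.run(_demo())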