packages/blueprints/gen-ai-chatbot/static-assets/chatbot-genai-components/backend/python/embedding/loaders/url.py (66 lines of code) (raw):

import logging
import urllib.error
import urllib.request
from collections.abc import Callable
from typing import Literal

from embedding.loaders.base import BaseLoader, Document
from embedding.loaders.playwright import (
    DelayUnstructuredHtmlEvaluator,
    PlaywrightURLLoader,
)
from embedding.loaders.unstructured import UnstructuredURLLoader
from embedding.loaders.youtube import YoutubeLoaderWithLangDetection, _parse_video_id

logger = logging.getLogger(__name__)

# Delay seconds to wait for the page to render by JavaScript.
DELAY_SEC = 2


def get_loader(loader_type: str, urls: list[str]) -> BaseLoader:
    """Build the loader registered for ``loader_type`` over ``urls``.

    Only the requested loader is constructed; the other loader classes are
    never instantiated, so their constructors cannot affect this call.

    Args:
        loader_type: One of ``"web"``, ``"unstructured"`` or ``"youtube"``.
        urls: URLs handed to the selected loader.

    Returns:
        A loader instance for the given URLs.

    Raises:
        KeyError: If ``loader_type`` is not a registered loader type.
    """
    # Factories are lazy on purpose: building a loader happens only for the
    # type that was actually asked for.
    factories: dict[str, Callable[[], BaseLoader]] = {
        "web": lambda: PlaywrightURLLoader(
            urls=urls, evaluator=DelayUnstructuredHtmlEvaluator(delay_sec=DELAY_SEC)
        ),
        "unstructured": lambda: UnstructuredURLLoader(urls, request_timeout=30),
        "youtube": lambda: YoutubeLoaderWithLangDetection(urls),
    }
    return factories[loader_type]()


def check_content_type(url: str) -> Literal["web", "unstructured", "youtube"]:
    """Classify ``url`` into the loader category that should fetch it.

    A URL for which ``_parse_video_id`` returns a truthy value is classified
    as ``"youtube"`` without any network access. Otherwise a ``HEAD`` request
    is sent and the ``Content-Type`` response header decides: a value
    containing ``text/html`` maps to ``"web"``, anything else to
    ``"unstructured"``.

    Args:
        url: The URL to classify.

    Returns:
        ``"youtube"``, ``"web"`` or ``"unstructured"``. If the ``HEAD``
        request fails for any reason, a warning is logged and
        ``"unstructured"`` is returned.
    """
    if _parse_video_id(url):
        return "youtube"

    # Using urllib.request instead of requests to avoid 403
    # Ref: https://stackoverflow.com/questions/74446830/how-to-fix-403-forbidden-errors-with-python-requests-even-with-user-agent-head
    req = urllib.request.Request(url, method="HEAD")
    req.add_header(
        "User-Agent",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:106.0) Gecko/20100101 Firefox/106.0",
    )
    req.add_header("Accept", "*/*")
    req.add_header("Accept-Language", "*")

    try:
        with urllib.request.urlopen(req, timeout=30) as response:
            content_type = response.headers.get("Content-Type", "").lower()
    except Exception as e:
        # Deliberately broad: classification is best-effort, so any probe
        # failure falls back to the unstructured loader instead of aborting.
        logger.warning(
            f"Failed to get content type of {url}: {e}. Use unstructured to load."
        )
        return "unstructured"

    return "web" if "text/html" in content_type else "unstructured"


def group_urls_by_content_type(urls: list[str]) -> dict[str, list[str]]:
    """Group ``urls`` by the category ``check_content_type`` assigns to each.

    Args:
        urls: URLs to classify; each one is checked in order.

    Returns:
        A dict with the keys ``"web"``, ``"unstructured"`` and ``"youtube"``
        (always all three, in that order), each mapping to the list of URLs
        in that category. Categories with no URLs map to an empty list.
    """
    res: dict[str, list[str]] = {
        "web": [],
        "unstructured": [],
        "youtube": [],
    }
    for url in urls:
        res[check_content_type(url)].append(url)
    return res


class UrlLoader(BaseLoader):
    """Loads a document from a URL."""

    def __init__(self, urls: list[str]):
        """Store the URLs to load.

        Args:
            urls: URLs to classify and load when ``load`` is called.
        """
        self._urls = urls

    def load(self) -> list[Document]:
        """Load documents from every URL with the loader matching its category.

        Returns:
            The documents from all non-empty categories, concatenated in
            category order (``"web"``, ``"unstructured"``, ``"youtube"``).
        """
        res: list[Document] = []
        categorized_urls = group_urls_by_content_type(self._urls)
        logger.info(f"URLs are categorized as: {categorized_urls}")
        for loader_type, urls in categorized_urls.items():
            if not urls:
                # Nothing to fetch for this category, so do not build or run
                # a loader for it.
                continue
            res.extend(get_loader(loader_type, urls).load())
        return res