google/generativeai/notebook/sheets_sanitize_url.py (41 lines of code) (raw):

# -*- coding: utf-8 -*- # Copyright 2023 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """Utilities for working with URLs.""" from __future__ import annotations import re from urllib import parse def _validate_url_part(part: str) -> None: if not re.fullmatch("[a-zA-Z0-9_-]*", part): raise ValueError('"{}" is outside the restricted character set'.format(part)) def _validate_url_query_or_fragment(part: str) -> None: for key, values in parse.parse_qs(part).items(): _validate_url_part(key) for value in values: _validate_url_part(value) def sanitize_sheets_url(url: str) -> str: """Sanitize a Sheets URL. Run some saftey checks to check whether `url` is a Sheets URL. This is not a general-purpose URL sanitizer. Rather, it makes use of the fact that we know the URL has to be for Sheets so we can make a few assumptions about (e.g. the domain). Args: url: The url to sanitize. Returns: The sanitized url. Raises: ValueError: If `url` does not match the expected restrictions for a Sheets URL. """ parse_result = parse.urlparse(url) if parse_result.scheme != "https": raise ValueError( 'Scheme for Sheets url must be "https", got "{}"'.format(parse_result.scheme) ) if parse_result.netloc not in ("docs.google.com", "sheets.googleapis.com"): raise ValueError( 'Domain for Sheets url must be "docs.google.com", got "{}"'.format(parse_result.netloc) ) # Path component. try: for fragment in parse_result.path.split("/"): _validate_url_part(fragment) except ValueError as exc: raise ValueError('Invalid path for Sheets url, got "{}"'.format(parse_result.path)) from exc # Params component. if parse_result.params: raise ValueError('Params component must be empty, got "{}"'.format(parse_result.params)) # Query component. try: _validate_url_query_or_fragment(parse_result.query) except ValueError as exc: raise ValueError( 'Invalid query for Sheets url, got "{}"'.format(parse_result.query) ) from exc # Fragment component. try: _validate_url_query_or_fragment(parse_result.fragment) except ValueError as exc: raise ValueError( 'Invalid fragment for Sheets url, got "{}"'.format(parse_result.fragment) ) from exc return url