data_validation/util.py (50 lines of code) (raw):

# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import json
import logging
import re
import time
from typing import TYPE_CHECKING, Optional, Union

from data_validation import exceptions

if TYPE_CHECKING:
    # Only needed for annotations; ``Table`` is a class exposed by this
    # module, so the module (not the class) is what gets imported.
    import ibis.expr.types


def timed_call(log_txt, fn, *args, **kwargs):
    """Call ``fn(*args, **kwargs)``, log how long it took and return its result.

    Args:
        log_txt: Prefix for the debug log message.
        fn: Callable to invoke.
        *args: Positional arguments forwarded to ``fn``.
        **kwargs: Keyword arguments forwarded to ``fn``.

    Returns:
        Whatever ``fn`` returns. If ``fn`` raises, the exception propagates
        and nothing is logged.
    """
    # perf_counter() is monotonic, so the measured duration cannot go
    # negative or jump if the system clock is adjusted during the call.
    t0 = time.perf_counter()
    result = fn(*args, **kwargs)
    elapsed = time.perf_counter() - t0
    logging.debug(f"{log_txt} elapsed: {round(elapsed, 2)}s")
    return result


def split_not_in_quotes(
    to_split: str, sep: str = " ", exclude_empty_tokens: bool = False
) -> list:
    """Split a string by a separator but only when the separator is not inside quotes.

    re pattern taken from this comment:
        https://stackoverflow.com/a/2787979/10979853

    The commenter's words should the link ever go stale:
        Each time it finds a semicolon, the lookahead scans the entire remaining
        string, making sure there's an even number of single-quotes and an even
        number of double-quotes. (Single-quotes inside double-quoted fields, or
        vice-versa, are ignored.) If the lookahead succeeds, the semicolon is a
        delimiter.

    The pattern doesn't cope with whitespace as sep, back to back spaces are
    multiple seps, therefore we have exclude_empty_tokens parameter.

    Args:
        to_split: The string to split.
        sep: The separator. NOTE: it is interpolated into the regular
            expression unescaped, so regex metacharacters (".", "|", ...)
            are interpreted as regex syntax rather than literal text.
        exclude_empty_tokens: When True, empty strings are removed from the
            result.

    Returns:
        The list of tokens; quote characters are left in place.
    """
    pattern = r"""%(sep)s(?=(?:[^'"]|'[^']*'|"[^"]*")*$)""" % {"sep": sep}
    tokens = re.split(pattern, to_split)
    if exclude_empty_tokens:
        return [t for t in tokens if t]
    return tokens


def dvt_config_string_to_dict(
    config_string: Union[str, dict, None]
) -> Optional[dict]:
    """Convert JSON in a string to a dict.

    Args:
        config_string: JSON text. Single quotes are tolerated in place of
            double quotes. A dict is returned unchanged.

    Returns:
        The parsed value, or None when ``config_string`` is falsy (None,
        empty string, empty dict).

    Raises:
        exceptions.ValidationException: If the text cannot be parsed as JSON,
            even after replacing single quotes with double quotes.
    """
    if not config_string:
        return None
    if isinstance(config_string, dict):
        return config_string

    # Try strict JSON first so that apostrophes inside properly double-quoted
    # values (e.g. a password such as "it's") are not corrupted by the quote
    # replacement below.
    try:
        return json.loads(config_string)
    except json.JSONDecodeError:
        pass

    # Fall back to the lenient form where single quotes delimit strings.
    try:
        return json.loads(config_string.replace("'", '"'))
    except json.JSONDecodeError as exc:
        raise exceptions.ValidationException(
            f"Invalid JSON format in connection parameter dictionary string: {config_string}"
        ) from exc


def ibis_table_to_sql(ibis_table: "ibis.expr.types.Table") -> str:
    """Function to generate the SQL string for the table based on the backend

    Args:
        ibis_table: The Ibis table expression to compile.

    Returns:
        For the SQLAlchemy-based backends listed below, the compiled
        statement's string with literal values bound inline. For any other
        backend, the result of ``ibis_table.compile()`` as is (presumably
        already a SQL string - confirm per backend).
    """
    sql_alchemy_clients = [
        "mysql",
        "oracle",
        "postgres",
        "db2",
        "mssql",
        "redshift",
        "snowflake",
    ]
    backend_name = ibis_table._find_backend().name
    compiled = ibis_table.compile()
    if backend_name in sql_alchemy_clients:
        # If the backend uses sqlalchemy, we will need to request sqla to
        # bind variables.
        return compiled.compile(compile_kwargs={"literal_binds": True}).string
    # For a non sqlalchemy backend, the parameters are already bound.
    return compiled