# Copyright 2017 PerfKitBenchmarker Authors. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Resource encapsulating provisioned Data Warehouse in the cloud Services.
Classes to wrap specific backend services are in the corresponding provider
directory as a subclass of BaseEdwService.
"""
import os
from typing import Any, Dict, List
from absl import flags
from perfkitbenchmarker import resource
flags.DEFINE_integer(
'edw_service_cluster_concurrency',
5,
'Number of queries to run concurrently on the cluster.',
)
flags.DEFINE_string(
'edw_service_cluster_snapshot',
None,
'If set, the snapshot to restore as cluster.',
)
flags.DEFINE_string(
'edw_service_cluster_identifier',
None,
'If set, the preprovisioned edw cluster.',
)
flags.DEFINE_string(
'edw_service_endpoint',
None,
'If set, the preprovisioned edw cluster endpoint.',
)
flags.DEFINE_string(
'edw_service_cluster_db',
None,
'If set, the db on cluster to use during the benchmark ('
'only applicable when using snapshots).',
)
flags.DEFINE_string(
'edw_service_cluster_user',
None,
'If set, the user authorized on cluster (only applicable '
'when using snapshots).',
)
flags.DEFINE_string(
'edw_service_cluster_password',
None,
'If set, the password authorized on cluster (only '
'applicable when using snapshots).',
)
flags.DEFINE_string(
'snowflake_snowsql_config_override_file',
None,
    'The SnowSQL configuration to use. '
'https://docs.snowflake.net/manuals/user-guide/snowsql-config.html#snowsql-config-file',
) # pylint: disable=line-too-long
flags.DEFINE_string(
'snowflake_connection',
None,
    'Named Snowflake connection defined in the SnowSQL config file. '
'https://docs.snowflake.net/manuals/user-guide/snowsql-start.html#using-named-connections',
) # pylint: disable=line-too-long
flags.DEFINE_integer(
'edw_suite_iterations', 1, 'Number of suite iterations to perform.'
)
flags.DEFINE_integer(
'edw_suite_warmup_iterations',
None,
'Number of suite warmup iterations to perform.',
)
# TODO(user): Revisit flags for accepting query lists.
flags.DEFINE_string(
'edw_simultaneous_queries',
None,
'CSV list of simultaneous queries to benchmark.',
)
flags.DEFINE_integer(
'edw_simultaneous_query_submission_interval',
    0,
'Simultaneous query submission interval in milliseconds.',
)
flags.DEFINE_string(
'edw_power_queries', None, 'CSV list of power queries to benchmark.'
)
flags.DEFINE_multi_string(
'concurrency_streams',
[],
'List of all query streams to execute. Each '
'stream should be passed in separately and the queries should be comma '
'separated, e.g. --concurrency_streams=1,2,3 --concurrency_streams=3,2,1',
)
flags.DEFINE_string(
'snowflake_warehouse',
None,
    'A virtual warehouse, often referred to simply as a '
    '"warehouse", is a cluster of compute resources in Snowflake. '
'https://docs.snowflake.com/en/user-guide/warehouses.html',
) # pylint: disable=line-too-long
flags.DEFINE_string(
'snowflake_database',
None,
'The hosted snowflake database to use during the benchmark.',
)
flags.DEFINE_string(
'snowflake_schema',
None,
'The schema of the hosted snowflake database to use during the benchmark.',
)
flags.DEFINE_enum(
'snowflake_client_interface',
'JDBC',
['JDBC'],
'The Runtime Interface used when interacting with Snowflake.',
)
flags.DEFINE_boolean(
    'edw_get_service_auxiliary_metrics',
    False,
    'If set, the benchmark will collect service-specific metrics from the'
    ' remote service after the benchmark has completed. Additional delay may'
    ' be incurred due to the need to wait for metadata propagation.',
)
flags.DEFINE_enum(
'edw_bq_feature_config',
'default',
['default', 'job_optional'],
'Selects from among various BigQuery feature configurations. '
'Currently supported: default (no special features), job_optional '
'(enables job_creation_optional query preview feature). '
'Only supported for Python client.',
)
FLAGS = flags.FLAGS
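# Illustrative invocation (a sketch only; flag values are hypothetical)
# combining several of the flags above on the PKB command line:
#   ./pkb.py --benchmarks=edw_benchmark \
#     --edw_power_queries=1,2,3 \
#     --concurrency_streams=1,2,3 --concurrency_streams=3,2,1 \
#     --edw_suite_iterations=2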
TYPE_2_PROVIDER = {
    'athena': 'aws',
    'redshift': 'aws',
    'spectrum': 'aws',
    'snowflake_aws': 'aws',
    'snowflake_azure': 'azure',
    'snowflakeexternal_aws': 'aws',
    'snowflakeicebergexternal_aws': 'aws',
    'snowflakeicebergmanaged_aws': 'aws',
    'snowflakeexternal_azure': 'azure',
    'bigquery': 'gcp',
    'endor': 'gcp',
    'endorazure': 'gcp',
    'bqfederated': 'gcp',
    'azuresqldatawarehouse': 'azure',
}
TYPE_2_MODULE = {
    'athena': 'perfkitbenchmarker.providers.aws.athena',
    'redshift': 'perfkitbenchmarker.providers.aws.redshift',
    'spectrum': 'perfkitbenchmarker.providers.aws.spectrum',
    'snowflake_aws': 'perfkitbenchmarker.providers.aws.snowflake_aws',
    'snowflake_azure': 'perfkitbenchmarker.providers.azure.snowflake_azure',
    'snowflakeexternal_aws': 'perfkitbenchmarker.providers.aws.snowflake_aws',
    'snowflakeicebergexternal_aws': (
        'perfkitbenchmarker.providers.aws.snowflake_aws'
    ),
    'snowflakeicebergmanaged_aws': (
        'perfkitbenchmarker.providers.aws.snowflake_aws'
    ),
    'snowflakeexternal_azure': (
        'perfkitbenchmarker.providers.azure.snowflake_azure'
    ),
    'bigquery': 'perfkitbenchmarker.providers.gcp.bigquery',
    'endor': 'perfkitbenchmarker.providers.gcp.bigquery',
    'endorazure': 'perfkitbenchmarker.providers.gcp.bigquery',
    'bqfederated': 'perfkitbenchmarker.providers.gcp.bigquery',
    'azuresqldatawarehouse': (
        'perfkitbenchmarker.providers.azure.azure_sql_data_warehouse'
    ),
}
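# Sketch (an assumption; the real lookup lives elsewhere in PKB): these tables
# are typically consumed by resolving a service type to its provider and then
# dynamically importing the implementing module, e.g.:
#   import importlib
#   provider = TYPE_2_PROVIDER['bigquery']  # 'gcp'
#   module = importlib.import_module(TYPE_2_MODULE['bigquery'])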
DEFAULT_NUMBER_OF_NODES = 1
# The order of stages is important to the successful lifecycle completion.
EDW_SERVICE_LIFECYCLE_STAGES = ['create', 'load', 'query', 'delete']
SAMPLE_QUERY_PATH = '/tmp/sample.sql'
SAMPLE_QUERY = 'select * from INFORMATION_SCHEMA.TABLES;'
class EdwExecutionError(Exception):
"""Encapsulates errors encountered during execution of a query."""
class EdwClientInterface:
"""Defines the interface for EDW service clients.
Attributes:
client_vm: An instance of virtual_machine.BaseVirtualMachine used to
interface with the edw service.
whitelist_ip: The IP to whitelist.
"""
def __init__(self):
self.client_vm = None
# set by derived classes
self.whitelist_ip = None
def SetProvisionedAttributes(self, benchmark_spec):
"""Sets any attributes that were unknown during initialization."""
self.client_vm = benchmark_spec.vms[0]
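    # Raise the SSH session cap so that many concurrent benchmark commands
    # (e.g. simultaneous query streams) can share connections to the client.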
self.client_vm.RemoteCommand(
'echo "\nMaxSessions 100" | sudo tee -a /etc/ssh/sshd_config'
)
def Prepare(self, package_name: str) -> None:
"""Prepares the client vm to execute query.
The default implementation raises an Error, to ensure client specific
installation and authentication of runner utilities.
Args:
package_name: String name of the package defining the preprovisioned data
(certificates, etc.) to extract and use during client vm preparation.
"""
raise NotImplementedError
def ExecuteQuery(
self, query_name: str, print_results: bool = False
) -> tuple[float, dict[str, Any]]:
"""Executes a query and returns performance details.
Args:
query_name: String name of the query to execute
print_results: Whether to include query results in execution details.
Returns:
A tuple of (execution_time, execution details)
execution_time: A Float variable set to the query's completion time in
secs. -1.0 is used as a sentinel value implying the query failed. For a
successful query the value is expected to be positive.
performance_details: A dictionary of query execution attributes eg. job_id
"""
raise NotImplementedError
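  # Illustrative caller-side sketch (hypothetical names): the -1.0 sentinel is
  # how callers detect failures, e.g.:
  #   run_time, details = client.ExecuteQuery('q1.sql')
  #   if run_time > 0:
  #     record_success(run_time, details)  # record_success is hypothetical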
def ExecuteSimultaneous(
self, submission_interval: int, queries: List[str]
) -> str:
"""Executes queries simultaneously on client and return performance details.
Simultaneous app expects queries as white space separated query file names.
Response format:
{"simultaneous_end":1601145943197,"simultaneous_start":1601145940113,
"stream_performance_array":[{"query_wall_time_in_secs":2.079,
"query_end":1601145942208,"job_id":"914682d9-4f64-4323-bad2-554267cbbd8d",
"query":"1","query_start":1601145940129},{"query_wall_time_in_secs":2.572,
"query_end":1601145943192,"job_id":"efbf93a1-614c-4645-a268-e3801ae994f1",
"query":"2","query_start":1601145940620}],
"simultaneous_wall_time_in_secs":3.084}
Args:
submission_interval: Simultaneous query submission interval in
milliseconds.
queries: List of string names of the queries to execute simultaneously.
Returns:
performance_details: A serialized dictionary of execution details.
"""
raise NotImplementedError
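  # Sketch, assuming the serialized response above is plain JSON: callers can
  # recover per-query wall times with the standard json module, e.g.:
  #   perf = json.loads(client.ExecuteSimultaneous(0, ['1.sql', '2.sql']))
  #   times = [q['query_wall_time_in_secs']
  #            for q in perf['stream_performance_array']]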
def ExecuteThroughput(
self,
concurrency_streams: List[List[str]],
labels: dict[str, str] | None = None,
) -> str:
"""Executes a throughput test and returns performance details.
Response format:
{"throughput_start":1601666911596,"throughput_end":1601666916139,
"throughput_wall_time_in_secs":4.543,
"all_streams_performance_array":[
{"stream_start":1601666911597,"stream_end":1601666916139,
"stream_wall_time_in_secs":4.542,
"stream_performance_array":[
{"query_wall_time_in_secs":2.238,"query_end":1601666913849,
"query":"1","query_start":1601666911611,
"details":{"job_id":"438170b0-b0cb-4185-b733-94dd05b46b05"}},
{"query_wall_time_in_secs":2.285,"query_end":1601666916139,
"query":"2","query_start":1601666913854,
"details":{"job_id":"371902c7-5964-46f6-9f90-1dd00137d0c8"}}
]},
{"stream_start":1601666911597,"stream_end":1601666916018,
"stream_wall_time_in_secs":4.421,
"stream_performance_array":[
{"query_wall_time_in_secs":2.552,"query_end":1601666914163,
"query":"2","query_start":1601666911611,
"details":{"job_id":"5dcba418-d1a2-4a73-be70-acc20c1f03e6"}},
{"query_wall_time_in_secs":1.855,"query_end":1601666916018,
"query":"1","query_start":1601666914163,
"details":{"job_id":"568c4526-ae26-4e9d-842c-03459c3a216d"}}
]}
]}
Args:
concurrency_streams: List of streams to execute simultaneously, each of
which is a list of string names of queries.
labels: A dictionary of labels to apply to the query for service-side
tracking and analysis. Must include the mandatory label
'minimal_run_key', which should be unique per individual iteration.
Returns:
A serialized dictionary of execution details.
"""
raise NotImplementedError
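  # Sketch, assuming the serialized response above is plain JSON: per-stream
  # wall times can be extracted the same way, e.g.:
  #   perf = json.loads(client.ExecuteThroughput(streams, labels=labels))
  #   stream_times = [s['stream_wall_time_in_secs']
  #                   for s in perf['all_streams_performance_array']]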
def WarmUpQuery(self):
"""Executes a service-agnostic query that can detect cold start issues."""
if self.client_vm is None:
raise ValueError('SetProvisionedAttributes() not called')
with open(SAMPLE_QUERY_PATH, 'w+') as f:
f.write(SAMPLE_QUERY)
self.client_vm.PushFile(SAMPLE_QUERY_PATH)
query_name = os.path.basename(SAMPLE_QUERY_PATH)
self.ExecuteQuery(query_name)
def GetMetadata(self) -> Dict[str, str]:
"""Returns the client interface metadata."""
raise NotImplementedError
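# Minimal concrete-interface sketch (hypothetical; real implementations live
# in the provider modules listed in TYPE_2_MODULE). It shows the smallest
# surface a subclass must provide to drive the single-query path; the runner
# binary below is an assumption, not a real tool.
class _ExampleClientInterface(EdwClientInterface):
  """Hypothetical client interface sketch; not used by any benchmark."""

  def Prepare(self, package_name: str) -> None:
    del package_name  # this sketch needs no preprovisioned data
    self.client_vm.Install('openjdk')

  def ExecuteQuery(
      self, query_name: str, print_results: bool = False
  ) -> tuple[float, dict[str, Any]]:
    del print_results
    stdout, _ = self.client_vm.RemoteCommand(
        f'./runner --query_file {query_name}'  # hypothetical runner binary
    )
    return float(stdout), {'client': 'example'}

  def GetMetadata(self) -> Dict[str, str]:
    return {'client': 'example'}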
class EdwService(resource.BaseResource):
"""Object representing a EDW Service."""
SERVICE_TYPE = 'abstract'
def __init__(self, edw_service_spec):
"""Initialize the edw service object.
Args:
edw_service_spec: spec of the edw service.
"""
    # Hand over the actual creation to the resource module, which assumes the
    # resource is PKB-managed by default.
is_user_managed = self.IsUserManaged(edw_service_spec)
# edw_service attribute
self.cluster_identifier = self.GetClusterIdentifier(edw_service_spec)
super().__init__(user_managed=is_user_managed)
# Provision related attributes
if edw_service_spec.snapshot:
self.snapshot = edw_service_spec.snapshot
else:
self.snapshot = None
# Cluster related attributes
self.concurrency = edw_service_spec.concurrency
self.node_type = edw_service_spec.node_type
if edw_service_spec.node_count:
self.node_count = edw_service_spec.node_count
else:
self.node_count = DEFAULT_NUMBER_OF_NODES
# Interaction related attributes
if edw_service_spec.endpoint:
self.endpoint = edw_service_spec.endpoint
else:
self.endpoint = ''
self.db = edw_service_spec.db
self.user = edw_service_spec.user
self.password = edw_service_spec.password
# resource config attribute
self.spec = edw_service_spec
# resource workflow management
self.supports_wait_on_delete = True
self.client_interface: EdwClientInterface
def GetClientInterface(self) -> EdwClientInterface:
"""Gets the active Client Interface."""
return self.client_interface
def IsUserManaged(self, edw_service_spec):
"""Indicates if the edw service instance is user managed.
Args:
edw_service_spec: spec of the edw service.
Returns:
A boolean, set to True if the edw service instance is user managed, False
otherwise.
"""
return edw_service_spec.cluster_identifier is not None
def GetClusterIdentifier(self, edw_service_spec):
"""Returns a string name of the Cluster Identifier.
Args:
edw_service_spec: spec of the edw service.
Returns:
A string, set to the name of the cluster identifier.
"""
if self.IsUserManaged(edw_service_spec):
return edw_service_spec.cluster_identifier
else:
return 'pkb-' + FLAGS.run_uri
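  # For example, a PKB-managed run whose run_uri is 'a1b2c3' gets the cluster
  # identifier 'pkb-a1b2c3'.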
def GetMetadata(self):
"""Return a dictionary of the metadata for this edw service."""
basic_data = {
'edw_service_type': self.spec.type,
'edw_cluster_identifier': self.cluster_identifier,
'edw_cluster_node_type': self.node_type,
'edw_cluster_node_count': self.node_count,
}
return basic_data
def GenerateLifecycleStageScriptName(self, lifecycle_stage):
"""Computes the default name for script implementing an edw lifecycle stage.
Args:
lifecycle_stage: Stage for which the corresponding sql script is desired.
Returns:
script name for implementing the argument lifecycle_stage.
"""
return os.path.basename(
os.path.normpath('database_%s.sql' % lifecycle_stage)
)
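  # e.g. GenerateLifecycleStageScriptName('load') returns 'database_load.sql'.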
def Cleanup(self):
"""Cleans up any temporary resources created for the service."""
pass
def GetDatasetLastUpdatedTime(self, dataset=None):
"""Get the formatted last modified timestamp of the dataset."""
raise NotImplementedError
  def SetDestinationTable(self, dataset: str):
    """Sets the dataset for destination tables. No-op by default."""
    pass
def ExtractDataset(
self, dest_bucket, dataset=None, tables=None, dest_format='CSV'
):
"""Extract all tables in a dataset to object storage.
Args:
dest_bucket: Name of the bucket to extract the data to. Should already
exist.
dataset: Optional name of the dataset. If none, will be determined by the
service.
tables: Optional list of table names to extract. If none, all tables in
the dataset will be extracted.
dest_format: Format to extract data in.
"""
raise NotImplementedError
def RemoveDataset(self, dataset=None):
"""Removes a dataset.
Args:
dataset: Optional name of the dataset. If none, will be determined by the
service.
"""
raise NotImplementedError
def CreateDataset(self, dataset=None, description=None):
"""Creates a new dataset.
Args:
dataset: Optional name of the dataset. If none, will be determined by the
service.
description: Optional description of the dataset.
"""
raise NotImplementedError
def LoadDataset(self, source_bucket, tables, dataset=None):
"""Load all tables in a dataset to a database from object storage.
Args:
source_bucket: Name of the bucket to load the data from. Should already
exist. Each table must have its own subfolder in the bucket named after
the table, containing one or more csv files that make up the table data.
tables: List of table names to load.
dataset: Optional name of the dataset. If none, will be determined by the
service.
"""
raise NotImplementedError
def OpenDataset(self, dataset: str):
"""Switch from the currently active dataset to the one specified.
Switches the dataset that will be accessed by queries sent through the
client interface that this EDW service provides.
Args:
dataset: Name of the dataset to make the active dataset.
"""
raise NotImplementedError
def CopyTable(self, copy_table_name: str, to_dataset: str) -> None:
"""Copy a table from the active dataset to the specified dataset.
Copies a table between datasets, from the active (current) dataset to
another named dataset in the same project.
Args:
copy_table_name: Name of the table to copy from the loaded dataset to the
copy dataset.
to_dataset: Name of the dataset to copy the table into.
"""
raise NotImplementedError
def RequiresWarmUpSuite(self) -> bool:
"""Verifies if the edw_service requires a warm up suite execution.
Currently enabled for all service types, for parity.
Returns:
A boolean value (True) if the warm suite is recommended.
"""
return True
def GetIterationAuxiliaryMetrics(self, iter_run_key: str) -> Dict[str, Any]:
"""Returns service-specific metrics derived from server-side metadata.
Must be run after the benchmark has completed.
Args:
iter_run_key: The unique identifier of the run and iteration to fetch
metrics for.
Returns:
A dictionary of the following format:
{ 'metric_1': { 'value': 1, 'unit': 'imperial femtoseconds' },
'metric_2': { 'value': 2, 'unit': 'metric dollars' }
...}
"""
raise NotImplementedError
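  # Illustrative consumer sketch (hypothetical names): metrics in the format
  # above map naturally onto PKB samples, e.g.:
  #   for name, metric in edw.GetIterationAuxiliaryMetrics(run_key).items():
  #     samples.append(sample.Sample(name, metric['value'], metric['unit']))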