# esrally/track/params.py
# Licensed to Elasticsearch B.V. under one or more contributor
# license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright
# ownership. Elasticsearch B.V. licenses this file to you under
# the Apache License, Version 2.0 (the "License"); you may
# not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from __future__ import annotations
import collections
import inspect
import logging
import math
import numbers
import operator
import random
import time
from abc import ABC
from enum import Enum
from typing import Callable, Deque
from esrally import exceptions
from esrally.track import track
from esrally.utils import io
__PARAM_SOURCES_BY_OP: dict[str, type[ParamSource]] = {}
__PARAM_SOURCES_BY_NAME: dict[str, type[ParamSource] | Callable] = {}
def param_source_for_operation(op_type, track, params, task_name):
try:
# we know that this can only be a Rally core parameter source
return __PARAM_SOURCES_BY_OP[op_type](track, params, operation_name=task_name)
except KeyError:
return ParamSource(track, params, operation_name=task_name)
def param_source_for_name(name, track, params):
param_source = __PARAM_SOURCES_BY_NAME[name]
if inspect.isfunction(param_source):
return DelegatingParamSource(track, params, param_source)
else:
return param_source(track, params)
def ensure_valid_param_source(param_source):
if not inspect.isfunction(param_source) and not inspect.isclass(param_source):
raise exceptions.RallyAssertionError(f"Parameter source [{param_source}] must be either a function or a class.")
def register_param_source_for_operation(op_type, param_source_class):
ensure_valid_param_source(param_source_class)
__PARAM_SOURCES_BY_OP[op_type.to_hyphenated_string()] = param_source_class
def register_param_source_for_name(name, param_source_class):
ensure_valid_param_source(param_source_class)
__PARAM_SOURCES_BY_NAME[name] = param_source_class
# only intended for tests
def _unregister_param_source_for_name(name):
# We intentionally do not specify a default value if the key does not exist. If we try to remove a key that we didn't insert then
# something is fishy with the test and we'd rather know early.
__PARAM_SOURCES_BY_NAME.pop(name)
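# Illustrative sketch of how a custom parameter source would hook into the registry above (the
# name "random-query" and the function random_query_params are hypothetical, not part of this module):
#
#   def random_query_params(track, params, **kwargs):
#       return {"index": "logs", "body": {"query": {"match_all": {}}}}
#
#   register_param_source_for_name("random-query", random_query_params)
#
# A registered function is wrapped in DelegatingParamSource by param_source_for_name, whereas a
# registered class is instantiated directly with (track, params).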
# Default
class ParamSource:
"""
A `ParamSource` captures the parameters for a given operation. Rally will create one global ParamSource for each operation and will then
invoke `#partition()` to get a `ParamSource` instance for each client. During the benchmark, `#params()` will be called repeatedly
before Rally invokes the corresponding runner (that will actually execute the operation against Elasticsearch).
"""
def __init__(self, track, params, **kwargs):
"""
Creates a new ParamSource instance.
:param track: The current track definition
:param params: A hash of all parameters that have been extracted for this operation.
"""
self.track = track
self._params = params
self.kwargs = kwargs
def partition(self, partition_index, total_partitions):
"""
This method will be invoked by Rally at the beginning of the lifecycle. It splits a parameter source per client. If the
corresponding operation is idempotent, return `self` (e.g. for queries). If the corresponding operation has side-effects and it
matters which client executes which part (e.g. an index operation from a source file), return the relevant part.
Do NOT assume that you can share state between ParamSource objects in different partitions (technical explanation: each client
will be a dedicated process, so each object of a `ParamSource` lives in its own process and hence cannot share state with other
instances).
:param partition_index: The current partition for which a parameter source is needed. It is in the range [0, `total_partitions`).
:param total_partitions: The total number of partitions (i.e. clients).
:return: A parameter source for the current partition.
"""
return self
@property
def infinite(self):
# for bwc
return self.size() is None
# Deprecated
def size(self):
"""
Rally has two modes in which it can run:
* It will either run an operation for a pre-determined number of times or
* It can run until the parameter source is exhausted.
In the former case, you should determine the number of times that `#params()` will be invoked. With that number, Rally can show
the progress made so far to the user. In the latter case, return ``None``.
:return: The "size" of this parameter source or ``None`` if should run eternally.
"""
return None
def params(self):
"""
:return: A hash containing the parameters that will be provided to the corresponding operation runner (key: parameter name,
value: parameter value).
"""
return self._params
def _client_params(self):
"""
        For use when a ParamSource does not propagate self._params but still uses the Elasticsearch client under the hood
:return: all applicable parameters that are global to Rally and apply to the elasticsearch-py client
"""
return {
"request-timeout": self._params.get("request-timeout"),
"headers": self._params.get("headers"),
"opaque-id": self._params.get("opaque-id"),
}
class DelegatingParamSource(ParamSource):
def __init__(self, track, params, delegate, **kwargs):
super().__init__(track, params, **kwargs)
self.delegate = delegate
def params(self):
return self.delegate(self.track, self._params, **self.kwargs)
class SleepParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
try:
duration = params["duration"]
except KeyError:
raise exceptions.InvalidSyntax("parameter 'duration' is mandatory for sleep operation")
if not isinstance(duration, numbers.Number):
raise exceptions.InvalidSyntax("parameter 'duration' for sleep operation must be a number")
if duration < 0:
raise exceptions.InvalidSyntax(f"parameter 'duration' must be non-negative but was {duration}")
def params(self):
return dict(self._params)
class CreateIndexParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
self.request_params = params.get("request-params", {})
self.index_definitions = []
if track.indices:
filter_idx = params.get("index")
if isinstance(filter_idx, str):
filter_idx = [filter_idx]
settings = params.get("settings")
for idx in track.indices:
if not filter_idx or idx.name in filter_idx:
body = idx.body
if body and settings:
if "settings" in body:
# merge (and potentially override)
body["settings"].update(settings)
else:
body["settings"] = settings
elif not body and settings:
body = {"settings": settings}
self.index_definitions.append((idx.name, body))
else:
try:
# only 'index' is mandatory, the body is optional (may be ok to create an index without a body)
idx = params["index"]
body = params.get("body")
if isinstance(idx, str):
idx = [idx]
for i in idx:
self.index_definitions.append((i, body))
except KeyError:
raise exceptions.InvalidSyntax("Please set the property 'index' for the create-index operation")
def params(self):
p = {}
# ensure we pass all parameters...
p.update(self._params)
p.update(
{
"indices": self.index_definitions,
"request-params": self.request_params,
}
)
return p
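# Illustrative behaviour of the settings merge in CreateIndexParamSource above (all values are
# hypothetical): if an index body declares {"settings": {"index.number_of_shards": 1}} and the
# operation params contain {"settings": {"index.number_of_replicas": 0}}, the index is created with
# both settings; for identical keys, the operation-level settings override the ones in the body.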
class CreateDataStreamParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
self.request_params = params.get("request-params", {})
self.data_stream_definitions = []
if track.data_streams:
filter_ds = params.get("data-stream")
if isinstance(filter_ds, str):
filter_ds = [filter_ds]
for ds in track.data_streams:
if not filter_ds or ds.name in filter_ds:
self.data_stream_definitions.append(ds.name)
else:
try:
data_stream = params["data-stream"]
data_streams = [data_stream] if isinstance(data_stream, str) else data_stream
for ds in data_streams:
self.data_stream_definitions.append(ds)
except KeyError:
raise exceptions.InvalidSyntax("Please set the property 'data-stream' for the create-data-stream operation")
def params(self):
p = {}
# ensure we pass all parameters...
p.update(self._params)
p.update(
{
"data-streams": self.data_stream_definitions,
"request-params": self.request_params,
}
)
return p
class DeleteDataStreamParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
self.request_params = params.get("request-params", {})
self.only_if_exists = params.get("only-if-exists", True)
self.data_stream_definitions = []
target_data_stream = params.get("data-stream")
if target_data_stream:
target_data_stream = [target_data_stream] if isinstance(target_data_stream, str) else target_data_stream
for ds in target_data_stream:
self.data_stream_definitions.append(ds)
elif track.data_streams:
for ds in track.data_streams:
self.data_stream_definitions.append(ds.name)
else:
raise exceptions.InvalidSyntax("delete-data-stream operation targets no data stream")
def params(self):
p = {}
# ensure we pass all parameters...
p.update(self._params)
p.update(
{"data-streams": self.data_stream_definitions, "request-params": self.request_params, "only-if-exists": self.only_if_exists}
)
return p
class DeleteIndexParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
self.request_params = params.get("request-params", {})
self.only_if_exists = params.get("only-if-exists", True)
self.index_definitions = []
target_index = params.get("index")
if target_index:
if isinstance(target_index, str):
target_index = [target_index]
for idx in target_index:
self.index_definitions.append(idx)
elif track.indices:
for idx in track.indices:
self.index_definitions.append(idx.name)
else:
raise exceptions.InvalidSyntax("delete-index operation targets no index")
def params(self):
p = {}
# ensure we pass all parameters...
p.update(self._params)
p.update(
{
"indices": self.index_definitions,
"request-params": self.request_params,
"only-if-exists": self.only_if_exists,
}
)
return p
class CreateIndexTemplateParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
self.request_params = params.get("request-params", {})
self.template_definitions = []
if track.templates:
filter_template = params.get("template")
settings = params.get("settings")
for template in track.templates:
if not filter_template or template.name == filter_template:
body = template.content
if body and settings:
if "settings" in body:
# merge (and potentially override)
body["settings"].update(settings)
else:
body["settings"] = settings
self.template_definitions.append((template.name, body))
else:
try:
self.template_definitions.append((params["template"], params["body"]))
except KeyError:
raise exceptions.InvalidSyntax("Please set the properties 'template' and 'body' for the create-index-template operation")
def params(self):
p = {}
# ensure we pass all parameters...
p.update(self._params)
p.update(
{
"templates": self.template_definitions,
"request-params": self.request_params,
}
)
return p
class DeleteTemplateParamSource(ABC, ParamSource):
def __init__(self, track, params, templates, **kwargs):
super().__init__(track, params, **kwargs)
self.only_if_exists = params.get("only-if-exists", True)
self.request_params = params.get("request-params", {})
self.template_definitions = []
if templates:
filter_template = params.get("template")
for template in templates:
if not filter_template or template.name == filter_template:
self.template_definitions.append((template.name, template.delete_matching_indices, template.pattern))
else:
try:
template = params["template"]
except KeyError:
raise exceptions.InvalidSyntax(f"Please set the property 'template' for the {params.get('operation-type')} operation")
delete_matching = params.get("delete-matching-indices", False)
try:
index_pattern = params["index-pattern"] if delete_matching else None
except KeyError:
raise exceptions.InvalidSyntax(
"The property 'index-pattern' is required for delete-index-template if 'delete-matching-indices' is true."
)
self.template_definitions.append((template, delete_matching, index_pattern))
def params(self):
p = {}
# ensure we pass all parameters...
p.update(self._params)
p.update(
{
"templates": self.template_definitions,
"only-if-exists": self.only_if_exists,
"request-params": self.request_params,
}
)
return p
class DeleteIndexTemplateParamSource(DeleteTemplateParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, track.templates, **kwargs)
class DeleteComposableTemplateParamSource(DeleteTemplateParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, track.composable_templates, **kwargs)
class DeleteComponentTemplateParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
self.only_if_exists = params.get("only-if-exists", True)
self.request_params = params.get("request-params", {})
self.template_definitions = []
if track.component_templates:
filter_template = params.get("template")
for template in track.component_templates:
if not filter_template or template.name == filter_template:
self.template_definitions.append(template.name)
else:
try:
template = params["template"]
self.template_definitions.append(template)
except KeyError:
raise exceptions.InvalidSyntax(f"Please set the property 'template' for the {params.get('operation-type')} operation.")
def params(self):
return {
"templates": self.template_definitions,
"only-if-exists": self.only_if_exists,
"request-params": self.request_params,
}
class CreateTemplateParamSource(ABC, ParamSource):
def __init__(self, track, params, templates, **kwargs):
super().__init__(track, params, **kwargs)
self.request_params = params.get("request-params", {})
self.template_definitions = []
if "template" in params and "body" in params:
self.template_definitions.append((params["template"], params["body"]))
elif templates:
filter_template = params.get("template")
settings = params.get("settings")
template_definitions = []
for template in templates:
if not filter_template or template.name == filter_template:
body = self._create_or_merge(template.content, ["template", "settings"], settings)
template_definitions.append((template.name, body))
if filter_template and not template_definitions:
template_names = ", ".join([template.name for template in templates])
raise exceptions.InvalidSyntax(f"Unknown template: {filter_template}. Available templates: {template_names}.")
self.template_definitions.extend(template_definitions)
else:
raise exceptions.InvalidSyntax(
"Please set the properties 'template' and 'body' for the "
f"{params.get('operation-type')} operation or declare composable and/or component "
"templates in the track"
)
@staticmethod
def _create_or_merge(content, path, new_content):
original_content = content
if new_content:
for sub_path in path:
if sub_path not in content:
content[sub_path] = {}
content = content[sub_path]
CreateTemplateParamSource.__merge(content, new_content)
return original_content
@staticmethod
def __merge(dct, merge_dct):
for k in merge_dct.keys():
if k in dct and isinstance(dct[k], dict) and isinstance(merge_dct[k], collections.abc.Mapping):
CreateTemplateParamSource.__merge(dct[k], merge_dct[k])
else:
dct[k] = merge_dct[k]
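    # Illustrative behaviour of the helpers above (values are hypothetical):
    #   _create_or_merge({"template": {"settings": {"a": 1}}}, ["template", "settings"], {"b": 2})
    # returns {"template": {"settings": {"a": 1, "b": 2}}}, i.e. the new settings are merged into
    # (and may override keys of) the existing settings at the given path.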
def params(self):
return {
"templates": self.template_definitions,
"request-params": self.request_params,
}
class CreateComposableTemplateParamSource(CreateTemplateParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, track.composable_templates, **kwargs)
class CreateComponentTemplateParamSource(CreateTemplateParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, track.component_templates, **kwargs)
class SearchParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
target_name = get_target(track, params)
type_name = params.get("type")
if params.get("data-stream") and type_name:
raise exceptions.InvalidSyntax(f"'type' not supported with 'data-stream' for operation '{kwargs.get('operation_name')}'")
request_cache = params.get("cache", None)
detailed_results = params.get("detailed-results", False)
query_body = params.get("body", None)
pages = params.get("pages", None)
results_per_page = params.get("results-per-page", None)
request_params = params.get("request-params", {})
response_compression_enabled = params.get("response-compression-enabled", True)
with_point_in_time_from = params.get("with-point-in-time-from", None)
self.query_params = {
"index": target_name,
"type": type_name,
"cache": request_cache,
"detailed-results": detailed_results,
"request-params": request_params,
"response-compression-enabled": response_compression_enabled,
"body": query_body,
}
if not target_name:
raise exceptions.InvalidSyntax(
f"'index' or 'data-stream' is mandatory and is missing for operation '{kwargs.get('operation_name')}'"
)
if pages:
self.query_params["pages"] = pages
if results_per_page:
self.query_params["results-per-page"] = results_per_page
if with_point_in_time_from:
self.query_params["with-point-in-time-from"] = with_point_in_time_from
if "assertions" in params:
if not detailed_results:
# for paginated queries the value does not matter because detailed results are always retrieved.
is_paginated = bool(pages)
if not is_paginated:
raise exceptions.InvalidSyntax("The property [detailed-results] must be [true] if assertions are defined")
self.query_params["assertions"] = params["assertions"]
# Ensure we pass global parameters
self.query_params.update(self._client_params())
def params(self):
return self.query_params
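# Illustrative shape of the dict returned by SearchParamSource#params() (all values are hypothetical):
# {
#     "index": "logs-*", "type": None, "cache": None, "detailed-results": False,
#     "request-params": {}, "response-compression-enabled": True,
#     "body": {"query": {"match_all": {}}},
#     # plus "pages", "results-per-page", "with-point-in-time-from" and "assertions" if configured,
#     # and the global client params (request-timeout, headers, opaque-id)
# }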
class IndexIdConflict(Enum):
"""
Determines which id conflicts to simulate during indexing.
* NoConflicts: Produce no id conflicts
* SequentialConflicts: A document id is replaced with a document id with a sequentially increasing id
* RandomConflicts: A document id is replaced with a document id with a random other id
Note that this assumes that each document in the benchmark corpus has an id between [1, size_of(corpus)]
"""
NoConflicts = 0
SequentialConflicts = 1
RandomConflicts = 2
class BulkIndexParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
id_conflicts = params.get("conflicts", None)
if not id_conflicts:
self.id_conflicts = IndexIdConflict.NoConflicts
elif id_conflicts == "sequential":
self.id_conflicts = IndexIdConflict.SequentialConflicts
elif id_conflicts == "random":
self.id_conflicts = IndexIdConflict.RandomConflicts
else:
raise exceptions.InvalidSyntax("Unknown 'conflicts' setting [%s]" % id_conflicts)
if "data-streams" in params and self.id_conflicts != IndexIdConflict.NoConflicts:
raise exceptions.InvalidSyntax("'conflicts' cannot be used with 'data-streams'")
if self.id_conflicts != IndexIdConflict.NoConflicts:
self.conflict_probability = self.float_param(
params, name="conflict-probability", default_value=25, min_value=0, max_value=100, min_operator=operator.lt
)
self.on_conflict = params.get("on-conflict", "index")
if self.on_conflict not in ["index", "update"]:
raise exceptions.InvalidSyntax(f"Unknown 'on-conflict' setting [{self.on_conflict}]")
self.recency = self.float_param(params, name="recency", default_value=0, min_value=0, max_value=1, min_operator=operator.lt)
else:
self.conflict_probability = None
self.on_conflict = None
self.recency = None
self.corpora = self.used_corpora(track, params)
if len(self.corpora) == 0:
raise exceptions.InvalidSyntax(
f"There is no document corpus definition for track {track}. You must add at "
f"least one before making bulk requests to Elasticsearch."
)
for corpus in self.corpora:
for document_set in corpus.documents:
if document_set.includes_action_and_meta_data and self.id_conflicts != IndexIdConflict.NoConflicts:
file_name = document_set.document_archive if document_set.has_compressed_corpus() else document_set.document_file
raise exceptions.InvalidSyntax(
"Cannot generate id conflicts [%s] as [%s] in document corpus [%s] already contains an "
"action and meta-data line." % (id_conflicts, file_name, corpus)
)
self.pipeline = params.get("pipeline", None)
try:
self.bulk_size = int(params["bulk-size"])
if self.bulk_size <= 0:
raise exceptions.InvalidSyntax("'bulk-size' must be positive but was %d" % self.bulk_size)
except KeyError:
raise exceptions.InvalidSyntax("Mandatory parameter 'bulk-size' is missing")
except ValueError:
raise exceptions.InvalidSyntax("'bulk-size' must be numeric")
try:
self.batch_size = int(params.get("batch-size", self.bulk_size))
if self.batch_size <= 0:
raise exceptions.InvalidSyntax("'batch-size' must be positive but was %d" % self.batch_size)
if self.batch_size < self.bulk_size:
raise exceptions.InvalidSyntax("'batch-size' must be greater than or equal to 'bulk-size'")
if self.batch_size % self.bulk_size != 0:
raise exceptions.InvalidSyntax("'batch-size' must be a multiple of 'bulk-size'")
except ValueError:
raise exceptions.InvalidSyntax("'batch-size' must be numeric")
self.ingest_percentage = self.float_param(params, name="ingest-percentage", default_value=100, min_value=0, max_value=100)
self.refresh = params.get("refresh")
self.looped = params.get("looped", False)
self.param_source = PartitionBulkIndexParamSource(
self.corpora,
self.batch_size,
self.bulk_size,
self.ingest_percentage,
self.id_conflicts,
self.conflict_probability,
self.on_conflict,
self.recency,
self.pipeline,
self.refresh,
self.looped,
self._params,
)
def float_param(self, params, name, default_value, min_value, max_value, min_operator=operator.le):
try:
value = float(params.get(name, default_value))
if min_operator(value, min_value) or value > max_value:
interval_min = "(" if min_operator is operator.le else "["
raise exceptions.InvalidSyntax(
f"'{name}' must be in the range {interval_min}{min_value:.1f}, {max_value:.1f}] but was {value:.1f}"
)
return value
except ValueError:
raise exceptions.InvalidSyntax(f"'{name}' must be numeric")
def used_corpora(self, t, params):
corpora = []
track_corpora_names = [corpus.name for corpus in t.corpora]
corpora_names = params.get("corpora", track_corpora_names)
if isinstance(corpora_names, str):
corpora_names = [corpora_names]
for corpus in t.corpora:
if corpus.name in corpora_names:
filtered_corpus = corpus.filter(
source_format=track.Documents.SOURCE_FORMAT_BULK,
target_indices=params.get("indices"),
target_data_streams=params.get("data-streams"),
)
if filtered_corpus.number_of_documents(source_format=track.Documents.SOURCE_FORMAT_BULK) > 0:
corpora.append(filtered_corpus)
# the track has corpora but none of them match
if t.corpora and not corpora:
raise exceptions.RallyAssertionError(
"The provided corpus %s does not match any of the corpora %s." % (corpora_names, track_corpora_names)
)
return corpora
def partition(self, partition_index, total_partitions):
# register the new partition internally
self.param_source.partition(partition_index, total_partitions)
return self.param_source
def params(self):
raise exceptions.RallyError("Do not use a BulkIndexParamSource without partitioning")
class PartitionBulkIndexParamSource:
def __init__(
self,
corpora,
batch_size,
bulk_size,
ingest_percentage,
id_conflicts,
conflict_probability,
on_conflict,
recency,
pipeline=None,
refresh=None,
looped: bool = False,
original_params=None,
):
"""
:param corpora: Specification of affected document corpora.
:param batch_size: The number of documents to read in one go.
:param bulk_size: The size of bulk index operations (number of documents per bulk).
:param ingest_percentage: A number between (0.0, 100.0] that defines how much of the whole corpus should be ingested.
:param id_conflicts: The type of id conflicts.
:param conflict_probability: A number between (0.0, 100.0] that defines the probability that a document is replaced by another one.
:param on_conflict: A string indicating which action should be taken on id conflicts (either "index" or "update").
:param recency: A number between [0.0, 1.0] indicating whether to bias generation of conflicting ids towards more recent ones.
May be None.
:param pipeline: The name of the ingest pipeline to run.
:param refresh: Optional string values are "true", "wait_for", "false".
If "true", Elasticsearch refreshes the affected shards in the background.
If "wait_for", the client is blocked until Elasticsearch finishes the refresh operation.
If "false", Elasticsearch will use the default refresh behavior.
        :param looped: Set to True for looped mode where bulk requests are repeated from the beginning once the entire corpus has been ingested.
:param original_params: The original dict passed to the parent parameter source.
"""
self.corpora = corpora
self.partitions = []
self.total_partitions = None
self.batch_size = batch_size
self.bulk_size = bulk_size
self.ingest_percentage = ingest_percentage
self.id_conflicts = id_conflicts
self.conflict_probability = conflict_probability
self.on_conflict = on_conflict
self.recency = recency
self.pipeline = pipeline
self.refresh = refresh
self.looped = looped
self.original_params = original_params
# this is only intended for unit-testing
self.create_reader = original_params.pop("__create_reader", create_default_reader)
self.current_bulk = 0
# use a value > 0 so percent_completed returns a sensible value
self.total_bulks = 1
self.infinite = False
def partition(self, partition_index, total_partitions):
if self.total_partitions is None:
self.total_partitions = total_partitions
elif self.total_partitions != total_partitions:
raise exceptions.RallyAssertionError(
f"Total partitions is expected to be [{self.total_partitions}] but was [{total_partitions}]"
)
self.partitions.append(partition_index)
def params(self):
if self.current_bulk == 0:
self._init_internal_params()
# self.internal_params always reads all files. This is necessary to ensure we terminate early in case
# the user has specified ingest percentage.
if self.current_bulk == self.total_bulks:
# start from the beginning in looped mode, otherwise stop the run
if self.looped:
self.current_bulk = 0
self._init_internal_params()
else:
raise StopIteration()
self.current_bulk += 1
return next(self.internal_params)
def _init_internal_params(self):
# contains a continuous range of client ids
self.partitions = sorted(self.partitions)
start_index = self.partitions[0]
end_index = self.partitions[-1]
self.internal_params = bulk_data_based(
self.total_partitions,
start_index,
end_index,
self.corpora,
self.batch_size,
self.bulk_size,
self.id_conflicts,
self.conflict_probability,
self.on_conflict,
self.recency,
self.pipeline,
self.original_params,
self.create_reader,
)
all_bulks = number_of_bulks(self.corpora, start_index, end_index, self.total_partitions, self.bulk_size)
self.total_bulks = math.ceil((all_bulks * self.ingest_percentage) / 100)
@property
def percent_completed(self):
return self.current_bulk / self.total_bulks
class OpenPointInTimeParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
target_name = get_target(track, params)
self._index_name = target_name
self._keep_alive = params.get("keep-alive")
def params(self):
parsed_params = {"index": self._index_name, "keep-alive": self._keep_alive}
parsed_params.update(self._client_params())
return parsed_params
class ClosePointInTimeParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
self._pit_task_name = params.get("with-point-in-time-from")
def params(self):
parsed_params = {"with-point-in-time-from": self._pit_task_name}
parsed_params.update(self._client_params())
return parsed_params
class ForceMergeParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
if len(track.indices) > 0 or len(track.data_streams) > 0:
# force merge data streams and indices - API call is the same so treat as indices
default_target = ",".join(map(str, track.indices + track.data_streams))
else:
default_target = "_all"
self._target_name = params.get("index")
if not self._target_name:
self._target_name = params.get("data-stream", default_target)
self._max_num_segments = params.get("max-num-segments")
self._poll_period = params.get("poll-period", 10)
self._mode = params.get("mode", "blocking")
def params(self):
parsed_params = {
"index": self._target_name,
"max-num-segments": self._max_num_segments,
"mode": self._mode,
"poll-period": self._poll_period,
}
parsed_params.update(self._client_params())
return parsed_params
class DownsampleParamSource(ParamSource):
def __init__(self, track, params, **kwargs):
super().__init__(track, params, **kwargs)
self._fixed_interval = params.get("fixed-interval", "1h")
params["index"] = params.get("source-index")
self._source_index = get_target(track, params)
self._target_index = params.get("target-index", f"{self._source_index}-{self._fixed_interval}")
def params(self):
parsed_params = {"fixed-interval": self._fixed_interval, "source-index": self._source_index, "target-index": self._target_index}
parsed_params.update(self._client_params())
return parsed_params
def get_target(track, params):
if len(track.indices) == 1:
default_target = track.indices[0].name
elif len(track.data_streams) == 1:
default_target = track.data_streams[0].name
else:
default_target = None
# indices are preferred but data streams can also be queried the same way
target_name = params.get("index")
if not target_name:
target_name = params.get("data-stream", default_target)
return target_name
def number_of_bulks(corpora, start_partition_index, end_partition_index, total_partitions, bulk_size):
"""
:return: The number of bulk operations that the given client will issue.
"""
bulks = 0
for corpus in corpora:
for docs in corpus.documents:
_, num_docs, _ = bounds(
docs.number_of_documents, start_partition_index, end_partition_index, total_partitions, docs.includes_action_and_meta_data
)
complete_bulks, rest = (num_docs // bulk_size, num_docs % bulk_size)
bulks += complete_bulks
if rest > 0:
bulks += 1
return bulks
def build_conflicting_ids(conflicts, docs_to_index, offset, shuffle=random.shuffle):
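    """
    Builds the pool of document ids used to simulate id conflicts, or returns ``None`` if no
    conflicts are requested. Illustrative example: ``build_conflicting_ids(IndexIdConflict.SequentialConflicts, 3, 5)``
    returns ``["0000000005", "0000000006", "0000000007"]``; with ``IndexIdConflict.RandomConflicts``
    the same ids are returned in shuffled order.
    """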
if conflicts is None or conflicts == IndexIdConflict.NoConflicts:
return None
all_ids = [0] * docs_to_index
for i in range(docs_to_index):
# always consider the offset as each client will index its own range and we don't want uncontrolled conflicts across clients
all_ids[i] = "%010d" % (offset + i)
if conflicts == IndexIdConflict.RandomConflicts:
shuffle(all_ids)
return all_ids
def chain(*iterables):
"""
Chains the given iterables similar to `itertools.chain` except that it also respects the context manager contract.
    :param iterables: A number of iterables that should be chained.
:return: An iterable that will delegate to all provided iterables in turn.
"""
for it in filter(lambda x: x is not None, iterables):
# execute within a context
with it:
yield from it
def create_default_reader(
docs, offset, num_lines, num_docs, batch_size, bulk_size, id_conflicts, conflict_probability, on_conflict, recency
):
source = Slice(io.MmapSource, offset, num_lines)
target = None
use_create = False
if docs.target_index:
target = docs.target_index
elif docs.target_data_stream:
target = docs.target_data_stream
use_create = True
if id_conflicts != IndexIdConflict.NoConflicts:
# can only create docs in data streams
raise exceptions.RallyError("Conflicts cannot be generated with append only data streams")
if docs.includes_action_and_meta_data:
return SourceOnlyIndexDataReader(docs.document_file, batch_size, bulk_size, source, target, docs.target_type)
else:
am_handler = GenerateActionMetaData(
target,
docs.target_type,
build_conflicting_ids(id_conflicts, num_docs, offset),
conflict_probability,
on_conflict,
recency,
use_create=use_create,
)
return MetadataIndexDataReader(docs.document_file, batch_size, bulk_size, source, am_handler, target, docs.target_type)
def create_readers(
num_clients: int,
start_client_index: int,
end_client_index: int,
corpora: list[track.DocumentCorpus],
batch_size: int,
bulk_size: int,
id_conflicts: IndexIdConflict,
conflict_probability: float,
on_conflict: str,
recency: str,
create_reader: Callable[..., IndexDataReader],
) -> list[IndexDataReader]:
"""
Return a list of IndexDataReader instances to allow a range of clients to read their share of corpora.
We're looking for better parallelism between corpora in indexing tasks in two ways:
1. By giving each client its own starting point in the list of corpora (using a
modulus of the number of corpora listed and the number of the client). In a track
with 2 corpora and 5 clients, clients 1, 3, and 5 would start with the first corpus
and clients 2 and 4 would start with the second corpus.
2. By generating the IndexDataReader list round-robin among all files, instead of in
order. If I'm the first client, I start with the first partition of the first file
of the first corpus. Then I move on to the first partition of the first file of the
second corpus, and so on.
"""
corpora_readers: list[Deque[IndexDataReader]] = []
total_readers = 0
# stagger which corpus each client starts with for better parallelism (see 1. above)
start_corpora_id = start_client_index % len(corpora)
reordered_corpora = corpora[start_corpora_id:] + corpora[:start_corpora_id]
for corpus in reordered_corpora:
reader_queue: Deque[IndexDataReader] = collections.deque()
for docs in corpus.documents:
offset, num_docs, num_lines = bounds(
docs.number_of_documents, start_client_index, end_client_index, num_clients, docs.includes_action_and_meta_data
)
if num_docs > 0:
reader: IndexDataReader = create_reader(
docs, offset, num_lines, num_docs, batch_size, bulk_size, id_conflicts, conflict_probability, on_conflict, recency
)
reader_queue.append(reader)
total_readers += 1
corpora_readers.append(reader_queue)
# Stagger which files will be read (see 2. above)
staggered_readers: list[IndexDataReader] = []
while total_readers > 0:
for reader_queue in corpora_readers:
# Since corpora don't necessarily contain the same number of documents, we
# ignore already consumed queues
if reader_queue:
staggered_readers.append(reader_queue.popleft())
total_readers -= 1
return staggered_readers
def bounds(total_docs, start_client_index, end_client_index, num_clients, includes_action_and_meta_data):
"""
Calculates the start offset and number of documents for a range of clients.
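    Illustrative example: ``bounds(1000, 0, 1, 4, False)`` returns ``(0, 500, 500)``, i.e. clients 0 and 1
    together start at line 0 and index the first 500 documents (500 lines). With
    ``includes_action_and_meta_data=True`` the same call returns ``(0, 500, 1000)`` because every document
    occupies two lines.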
:param total_docs: The total number of documents to index.
    :param start_client_index: The first client index. Must be in the range [0, `num_clients`).
    :param end_client_index: The last client index. Must be in the range [0, `num_clients`).
:param num_clients: The total number of clients that will run bulk index operations.
:param includes_action_and_meta_data: Whether the source file already includes the action and meta-data line.
    :return: A tuple containing: the start offset (in lines) for the document corpus, the number of documents that the
clients should index, and the number of lines that the clients should read.
"""
source_lines_per_doc = 2 if includes_action_and_meta_data else 1
docs_per_client = total_docs / num_clients
start_offset_docs = round(docs_per_client * start_client_index)
end_offset_docs = round(docs_per_client * (end_client_index + 1))
offset_lines = start_offset_docs * source_lines_per_doc
docs = end_offset_docs - start_offset_docs
lines = docs * source_lines_per_doc
return offset_lines, docs, lines
def bulk_generator(readers, pipeline, original_params):
bulk_id = 0
for index, type, batch in readers:
        # each batch can consist of one or more bulks
for docs_in_bulk, bulk in batch:
bulk_id += 1
bulk_params = {
"index": index,
"type": type,
# For our implementation it's always present. Either the original source file already contains this line or the generator
# has added it.
"action-metadata-present": True,
"body": bulk,
# This is not always equal to the bulk_size we get as parameter. The last bulk may be less than the bulk size.
"bulk-size": docs_in_bulk,
"unit": "docs",
}
if pipeline:
bulk_params["pipeline"] = pipeline
params = original_params.copy()
params.update(bulk_params)
yield params
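# Illustrative shape of a single dict yielded by bulk_generator (all values are hypothetical):
# {
#     "index": "logs", "type": None, "action-metadata-present": True,
#     "body": b'{"index": ...}\n{"message": ...}\n',  # newline-delimited bulk body
#     "bulk-size": 500,   # number of docs in this bulk; the last bulk may be smaller than configured
#     "unit": "docs",
#     "pipeline": "my-pipeline",   # only present if a pipeline was configured
# }
# merged on top of a copy of the original track-level parameters.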
def bulk_data_based(
num_clients,
start_client_index,
end_client_index,
corpora,
batch_size,
bulk_size,
id_conflicts,
conflict_probability,
on_conflict,
recency,
pipeline,
original_params,
create_reader=create_default_reader,
):
"""
Calculates the necessary schedule for bulk operations.
:param num_clients: The total number of clients that will run the bulk operation.
    :param start_client_index: The first client for which we calculated the schedule. Must be in the range [0, `num_clients`).
    :param end_client_index: The last client for which we calculated the schedule. Must be in the range [0, `num_clients`).
:param corpora: Specification of affected document corpora.
:param batch_size: The number of documents to read in one go.
:param bulk_size: The size of bulk index operations (number of documents per bulk).
:param id_conflicts: The type of id conflicts to simulate.
:param conflict_probability: A number between (0.0, 100.0] that defines the probability that a document is replaced by another one.
:param on_conflict: A string indicating which action should be taken on id conflicts (either "index" or "update").
:param recency: A number between [0.0, 1.0] indicating whether to bias generation of conflicting ids towards more recent ones.
May be None.
:param pipeline: Name of the ingest pipeline to use. May be None.
:param original_params: A dict of original parameters that were passed from the track. They will be merged into the returned parameters.
:param create_reader: A function to create the index reader. By default a file based index reader will be created. This parameter is
intended for testing only.
:return: A generator for the bulk operations of the given client.
"""
readers = create_readers(
num_clients,
start_client_index,
end_client_index,
corpora,
batch_size,
bulk_size,
id_conflicts,
conflict_probability,
on_conflict,
recency,
create_reader,
)
return bulk_generator(chain(*readers), pipeline, original_params)
class GenerateActionMetaData:
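    """
    Generates ``(action, meta_data_line)`` tuples for each document to be indexed, for example
    (the index name "logs" is purely illustrative): ``("index", '{"index": {"_index": "logs"}}')``
    when no id conflicts are simulated, or ``("update", '{"update": {"_index": "logs", "_id": "0000000007"}}')``
    when an id conflict is simulated with ``on_conflict="update"``. Each meta-data line carries a
    trailing newline so it can be written directly into the bulk body.
    """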
RECENCY_SLOPE = 30
def __init__(
self,
index_name,
type_name,
conflicting_ids=None,
conflict_probability=None,
on_conflict=None,
recency=None,
rand=random.random,
randint=random.randint,
randexp=random.expovariate,
use_create=False,
):
if type_name:
self.meta_data_index_with_id = '{"index": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % (index_name, type_name, "%s")
self.meta_data_update_with_id = '{"update": {"_index": "%s", "_type": "%s", "_id": "%s"}}\n' % (index_name, type_name, "%s")
self.meta_data_index_no_id = '{"index": {"_index": "%s", "_type": "%s"}}\n' % (index_name, type_name)
else:
self.meta_data_index_with_id = '{"index": {"_index": "%s", "_id": "%s"}}\n' % (index_name, "%s")
self.meta_data_update_with_id = '{"update": {"_index": "%s", "_id": "%s"}}\n' % (index_name, "%s")
self.meta_data_index_no_id = '{"index": {"_index": "%s"}}\n' % index_name
self.meta_data_create_no_id = '{"create": {"_index": "%s"}}\n' % index_name
if use_create and conflicting_ids:
raise exceptions.RallyError("Index mode '_create' cannot be used with conflicting ids")
self.conflicting_ids = conflicting_ids
self.on_conflict = on_conflict
self.use_create = use_create
# random() produces numbers between 0 and 1 and the user denotes the probability in percentage between 0 and 100
self.conflict_probability = conflict_probability / 100.0 if conflict_probability is not None else 0
self.recency = recency if recency is not None else 0
self.rand = rand
self.randint = randint
self.randexp = randexp
self.id_up_to = 0
@property
def is_constant(self):
"""
:return: True iff the iterator will always return the same value.
"""
return self.conflicting_ids is None
def __iter__(self):
return self
def __next__(self):
if self.conflicting_ids is not None:
if self.conflict_probability and self.id_up_to > 0 and self.rand() <= self.conflict_probability:
# a recency of zero means that we don't care about recency and just take a random number
# within the whole interval.
if self.recency == 0:
idx = self.randint(0, self.id_up_to - 1)
else:
# A recency > 0 biases id selection towards more recent ids. The recency parameter decides
# by how much we bias. See docs for the resulting curve.
#
# idx_range is in the interval [0, 1].
idx_range = min(self.randexp(GenerateActionMetaData.RECENCY_SLOPE * self.recency), 1)
# the resulting index is in the range [0, self.id_up_to). Note that a smaller idx_range
# biases towards more recently used ids (higher indexes).
idx = round((self.id_up_to - 1) * (1 - idx_range))
doc_id = self.conflicting_ids[idx]
action = self.on_conflict
else:
if self.id_up_to >= len(self.conflicting_ids):
raise StopIteration()
doc_id = self.conflicting_ids[self.id_up_to]
self.id_up_to += 1
action = "index"
if action == "index":
return "index", self.meta_data_index_with_id % doc_id
elif action == "update":
return "update", self.meta_data_update_with_id % doc_id
else:
raise exceptions.RallyAssertionError(f"Unknown action [{action}]")
else:
if self.use_create:
return "create", self.meta_data_create_no_id
return "index", self.meta_data_index_no_id
class Slice:
def __init__(self, source_class, offset, number_of_lines):
self.source_class = source_class
self.source = None
self.offset = offset
self.number_of_lines = number_of_lines
self.current_line = 0
self.bulk_size = None
self.logger = logging.getLogger(__name__)
def open(self, file_name, mode, bulk_size):
self.bulk_size = bulk_size
self.source = self.source_class(file_name, mode).open()
self.logger.info(
"Will read [%d] lines from [%s] starting from line [%d] with bulk size [%d].",
self.number_of_lines,
file_name,
self.offset,
self.bulk_size,
)
start = time.perf_counter()
io.skip_lines(file_name, self.source, self.offset)
end = time.perf_counter()
self.logger.debug("Skipping [%d] lines took [%f] s.", self.offset, end - start)
return self
def close(self):
self.source.close()
self.source = None
def __iter__(self):
return self
def __next__(self):
if self.current_line >= self.number_of_lines:
raise StopIteration()
# ensure we don't read past the allowed number of lines.
lines = self.source.readlines(min(self.bulk_size, self.number_of_lines - self.current_line))
self.current_line += len(lines)
if len(lines) == 0:
raise StopIteration()
return lines
def __str__(self):
return "%s[%d;%d]" % (self.source, self.offset, self.offset + self.number_of_lines)
class IndexDataReader:
"""
Reads a file in bulks into an array and also adds a meta-data line before each document if necessary.
This implementation also supports batching. This means that you can specify batch_size = N * bulk_size, where N
is any natural number >= 1. This makes file reading more efficient for small bulk sizes.
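    For example (illustrative numbers): with ``bulk_size=1000`` and ``batch_size=10000``, each batch read
    from disk provides the lines for ten bulk requests.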
"""
def __init__(self, data_file, batch_size, bulk_size, file_source, index_name, type_name):
self.data_file = data_file
self.batch_size = batch_size
self.bulk_size = bulk_size
self.file_source = file_source
self.index_name = index_name
self.type_name = type_name
def __enter__(self):
self.file_source.open(self.data_file, "rt", self.bulk_size)
return self
def __iter__(self):
return self
def __next__(self):
"""
        Returns lines for N bulk requests (where N is batch_size / bulk_size).
"""
batch = []
try:
docs_in_batch = 0
while docs_in_batch < self.batch_size:
try:
docs_in_bulk, bulk = self.read_bulk()
except StopIteration:
break
if docs_in_bulk == 0:
break
docs_in_batch += docs_in_bulk
batch.append((docs_in_bulk, b"".join(bulk)))
if docs_in_batch == 0:
raise StopIteration()
return self.index_name, self.type_name, batch
except OSError:
logging.getLogger(__name__).exception("Could not read [%s]", self.data_file)
def __exit__(self, exc_type, exc_val, exc_tb):
self.file_source.close()
return False
class MetadataIndexDataReader(IndexDataReader):
def __init__(self, data_file, batch_size, bulk_size, file_source, action_metadata, index_name, type_name):
super().__init__(data_file, batch_size, bulk_size, file_source, index_name, type_name)
self.action_metadata = action_metadata
self.action_metadata_line = None
def __enter__(self):
super().__enter__()
if self.action_metadata.is_constant:
_, self.action_metadata_line = next(self.action_metadata)
self.read_bulk = self._read_bulk_fast
else:
self.read_bulk = self._read_bulk_regular
return self
def _read_bulk_fast(self):
"""
Special-case implementation for bulk data files where the action and meta-data line is always identical.
"""
current_bulk = []
# hoist
action_metadata_line = self.action_metadata_line.encode("utf-8")
docs = next(self.file_source)
for doc in docs:
current_bulk.append(action_metadata_line)
current_bulk.append(doc)
return len(docs), current_bulk
def _read_bulk_regular(self):
"""
General case implementation for bulk files. This implementation can cover all cases but is slower when the
action and meta-data line is always identical.
"""
current_bulk = []
docs = next(self.file_source)
for doc in docs:
action_metadata_item = next(self.action_metadata)
if action_metadata_item:
action_type, action_metadata_line = action_metadata_item
current_bulk.append(action_metadata_line.encode("utf-8"))
if action_type == "update":
# remove the trailing "\n" as the doc needs to fit on one line
doc = doc.strip()
current_bulk.append(b'{"doc":%s}\n' % doc)
else:
current_bulk.append(doc)
else:
current_bulk.append(doc)
return len(docs), current_bulk
class SourceOnlyIndexDataReader(IndexDataReader):
def __init__(self, data_file, batch_size, bulk_size, file_source, index_name, type_name):
        # keep the batch size as it only considers documents read, not lines read, but double the bulk size
        # because documents are only on every other line.
super().__init__(data_file, batch_size, bulk_size * 2, file_source, index_name, type_name)
def read_bulk(self):
bulk_items = next(self.file_source)
return len(bulk_items) // 2, bulk_items
register_param_source_for_operation(track.OperationType.Bulk, BulkIndexParamSource)
register_param_source_for_operation(track.OperationType.Search, SearchParamSource)
register_param_source_for_operation(track.OperationType.ScrollSearch, SearchParamSource)
register_param_source_for_operation(track.OperationType.PaginatedSearch, SearchParamSource)
register_param_source_for_operation(track.OperationType.CompositeAgg, SearchParamSource)
register_param_source_for_operation(track.OperationType.CreateIndex, CreateIndexParamSource)
register_param_source_for_operation(track.OperationType.DeleteIndex, DeleteIndexParamSource)
register_param_source_for_operation(track.OperationType.CreateDataStream, CreateDataStreamParamSource)
register_param_source_for_operation(track.OperationType.DeleteDataStream, DeleteDataStreamParamSource)
register_param_source_for_operation(track.OperationType.CreateIndexTemplate, CreateIndexTemplateParamSource)
register_param_source_for_operation(track.OperationType.DeleteIndexTemplate, DeleteIndexTemplateParamSource)
register_param_source_for_operation(track.OperationType.CreateComponentTemplate, CreateComponentTemplateParamSource)
register_param_source_for_operation(track.OperationType.DeleteComponentTemplate, DeleteComponentTemplateParamSource)
register_param_source_for_operation(track.OperationType.CreateComposableTemplate, CreateComposableTemplateParamSource)
register_param_source_for_operation(track.OperationType.DeleteComposableTemplate, DeleteComposableTemplateParamSource)
register_param_source_for_operation(track.OperationType.Sleep, SleepParamSource)
register_param_source_for_operation(track.OperationType.ForceMerge, ForceMergeParamSource)
register_param_source_for_operation(track.OperationType.Downsample, DownsampleParamSource)
# Also register by name, so users can use it too
register_param_source_for_name("file-reader", BulkIndexParamSource)