in elastic/shared/track_processors/data_generator.py [0:0]
def _init_internal_params(self):
    """Initialize per-client generation state.

    Seeds the module RNG deterministically per client, creates this
    client's readers, samples corpus stats, and derives the per-corpus
    document counts/ratios needed to satisfy the requested total GB.

    Side effects:
        Sets ``self.readers``, ``self._corpora_doc_ratios``,
        ``self.total_docs``, and ``self.docs_per_client``.
    """
    # avoid zero seeds because of client_index == 0
    seed = (self._client_index + 1) * self._random_seed if self._random_seed else None
    random.seed(seed)
    # NOTE: seed is None when no _random_seed was configured, so it must be
    # formatted with %s — %d would raise a TypeError inside logging.
    self.logger.info(
        "Initializing generator [%d/%d] with seed [%s].",
        self._client_index,
        self._client_count,
        seed,
    )
    self.readers = self._create_readers(self._client_count, self._client_index)
    corpus_stats = self._sample_corpus_stats()
    # we will be sampling our corpora based on required doc ratios to satisfy the total gb.
    # Larger corpus need a smaller ratio of lines to satisfy the original user specified ratios in gb
    corpora_ratios = {
        corpus_name: ratio
        for integration_name, integration in self._integration_ratios.items()
        for corpus_name, ratio in integration["corpora"].items()
    }
    corpora_doc_counts = calculate_corpus_counts(
        corpus_stats,
        corpora_ratios,
        self._data_generation_gb,
        self._max_generation_size_gb,
    )
    self._corpora_doc_ratios = calculate_integration_ratios(corpora_doc_counts)
    self.total_docs = sum(corpora_doc_counts.values())
    # Log the global plan once, from the first client only, to avoid duplicates.
    if self._client_index == 0:
        self.logger.info("Total Docs: [%s]", self.total_docs)
        self.logger.info("Corpora Counts: [%s]", json.dumps(corpora_doc_counts))
        self.logger.info("Corpora Ratios: [%s]", json.dumps(self._corpora_doc_ratios))
    # last client gets a little more from bounds function
    _, self.docs_per_client = bounds(self.total_docs, self._client_index, self._client_count)