in elastic/shared/track_processors/data_generator.py [0:0]
def on_prepare_track(self, track, data_root_dir):
    """Prepare the source corpora of ``track`` and schedule its data generation.

    Nothing is done and an empty list is returned when the selected (or
    default) challenge sets the ``generate-data`` parameter to a falsy value;
    the parameter defaults to ``True``.

    Otherwise every corpus that is not flagged ``generated`` in its meta-data
    gets its document sets prepared below ``<data_root_dir>/<track.name>/<corpus.name>``,
    provided both ``self.downloader`` and ``self.decompressor`` are set.

    As a side effect the challenge parameter ``output-folder`` is set to
    ``<data_root_dir>/<track.name>/generated/<track-id>``.

    :param track: The track being prepared. Its challenge parameters are read
        and updated in place.
    :param data_root_dir: Directory below which the per-track data directory lives.
    :return: A list of ``(generate, params)`` tuples, one per data generation
        client. ``params`` carries the track, the track data root, the client
        index and the total client count.
    :raises KeyError: If the challenge parameters contain no ``track-id``.
    """
    # Every setting read or written below lives in this one mapping.
    challenge_params = track.selected_challenge_or_default.parameters
    if not challenge_params.get("generate-data", True):
        return []

    track_data_root = os.path.join(data_root_dir, track.name)
    for corpus in track.corpora:
        if corpus.meta_data.get("generated", False):
            # Corpora flagged as generated are not prepared here.
            continue

        data_root = os.path.join(track_data_root, corpus.name)
        self.logger.info(
            "Resolved data root directory for document corpus [%s] in track [%s] to [%s].",
            corpus.name,
            track.name,
            data_root,
        )

        # only set for real benchmarks, not in unit tests
        if not (self.downloader and self.decompressor):
            continue

        preparator = DocumentSetPreparator(track.name, self.downloader, self.decompressor)
        for document_set in corpus.documents:
            preparator.prepare_document_set(document_set, data_root)

    # data is now available locally, proceed with generating data
    client_count = challenge_params.get("data-generation-clients", 2)
    track_id = challenge_params["track-id"]
    challenge_params["output-folder"] = os.path.join(track_data_root, "generated", track_id)

    # One task per client; each gets its own parameter dict.
    return [
        (
            generate,
            {
                "track": track,
                "track_data_root": track_data_root,
                "client_index": client_index,
                "client_count": client_count,
            },
        )
        for client_index in range(client_count)
    ]