in esrally/track/loader.py [0:0]
def prepare_document_set(self, document_set, data_root):
    """
    Make sure the document file of a document set is usable from local disk.

    The method repeats the following steps until the uncompressed document file is present in ``data_root``
    with its expected size:

    * If the corpus has an archive and that archive is present locally with its expected size, decompress it.
    * Otherwise download the archive (when the corpus has one) or else the uncompressed document file.

    Once the uncompressed file is in place, a file offset table is created for it.

    :param document_set: A document set referencing a compressed and / or an uncompressed document file.
    :param data_root: The data root directory for this document set.
    :raises exceptions.RallyAssertionError: If the document set has neither a compressed nor an uncompressed corpus.
    :raises exceptions.DataError: If the download fails. When the failure is the missing base URL and the
                                  download target is already present locally, the error message points out
                                  the size mismatch of that file instead.
    """
    uncompressed_path = os.path.join(data_root, document_set.document_file)
    if document_set.has_compressed_corpus():
        compressed_path = os.path.join(data_root, document_set.document_archive)
    else:
        compressed_path = None

    # Every pass performs exactly one step (decompress or download) and then re-evaluates the state on disk.
    while not (
        self.is_locally_available(uncompressed_path)
        and self.has_expected_size(uncompressed_path, document_set.uncompressed_size_in_bytes)
    ):
        archive_is_usable = (
            document_set.has_compressed_corpus()
            and self.is_locally_available(compressed_path)
            and self.has_expected_size(compressed_path, document_set.compressed_size_in_bytes)
        )
        if archive_is_usable:
            self.decompressor.decompress(compressed_path, uncompressed_path, document_set.uncompressed_size_in_bytes)
            continue

        # Nothing usable on disk yet: prefer fetching the archive over the uncompressed file.
        if document_set.has_compressed_corpus():
            download_target, download_size = compressed_path, document_set.compressed_size_in_bytes
        elif document_set.has_uncompressed_corpus():
            download_target, download_size = uncompressed_path, document_set.uncompressed_size_in_bytes
        else:
            # this should not happen in practice as the JSON schema should take care of this
            raise exceptions.RallyAssertionError(f"Track {self.track_name} specifies documents but no corpus")

        try:
            self.downloader.download(document_set.base_url, download_target, download_size)
        except exceptions.DataError as e:
            missing_base_url = e.message == "Cannot download data because no base URL is provided."
            if missing_base_url and self.is_locally_available(download_target):
                # The file exists but was rejected by the size check above, so report that instead.
                raise exceptions.DataError(
                    f"[{download_target}] is present but does not have the expected "
                    f"size of [{download_size}] bytes and it cannot be downloaded "
                    f"because no base URL is provided."
                ) from None
            raise

    self.create_file_offset_table(uncompressed_path, document_set.number_of_lines)