services/worker/src/worker/main.py [29:67]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
        init_logging(level=app_config.log.level)
        # ^ set first to have logs as soon as possible
        # Working directories derived from the config; both values are handed to the
        # JobRunnerFactory below.
        parquet_metadata_directory = init_parquet_metadata_dir(directory=app_config.parquet_metadata.storage_directory)
        statistics_cache_directory = init_statistics_cache_dir(app_config.descriptive_statistics.cache_directory)

        # Storage client configured from the assets settings (protocol, root, base URL)
        # and the S3 settings; it is passed to the JobRunnerFactory below.
        storage_client = StorageClient(
            protocol=app_config.assets.storage_protocol,
            storage_root=app_config.assets.storage_root,
            base_url=app_config.assets.base_url,
            overwrite=True,  # all the job runners will overwrite the files
            s3_config=app_config.s3,
            # no need to specify cloudfront config here, as we are not generating signed urls in cached entries
        )

        # Enter the three resources as context managers in a single `with` statement,
        # so each one's __exit__ runs when the block is left (normally or on error).
        with (
            LibrariesResource(
                hf_endpoint=app_config.common.hf_endpoint,
                init_hf_datasets_cache=app_config.datasets_based.hf_datasets_cache,
                numba_path=app_config.numba.path,
            ) as libraries_resource,
            CacheMongoResource(
                database=app_config.cache.mongo_database, host=app_config.cache.mongo_url
            ) as cache_resource,
            QueueMongoResource(
                database=app_config.queue.mongo_database, host=app_config.queue.mongo_url
            ) as queue_resource,
        ):
            # Fail fast: raise RuntimeError if either database resource reports that
            # it is not available, before any job runner factory is built.
            if not cache_resource.is_available():
                raise RuntimeError("The connection to the cache database could not be established. Exiting.")
            if not queue_resource.is_available():
                raise RuntimeError("The connection to the queue database could not be established. Exiting.")

            # The factory receives the datasets cache exposed by the libraries resource,
            # together with the directories and storage client prepared above.
            job_runner_factory = JobRunnerFactory(
                app_config=app_config,
                hf_datasets_cache=libraries_resource.hf_datasets_cache,
                parquet_metadata_directory=parquet_metadata_directory,
                statistics_cache_directory=statistics_cache_directory,
                storage_client=storage_client,
            )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



services/worker/src/worker/start_worker_loop.py [28:66]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    init_logging(level=app_config.log.level)
    # ^ set first to have logs as soon as possible
    # Working directories derived from the config; both values are handed to the
    # JobRunnerFactory below.
    parquet_metadata_directory = init_parquet_metadata_dir(directory=app_config.parquet_metadata.storage_directory)
    statistics_cache_directory = init_statistics_cache_dir(app_config.descriptive_statistics.cache_directory)

    # Storage client configured from the assets settings (protocol, root, base URL)
    # and the S3 settings; it is passed to the JobRunnerFactory below.
    storage_client = StorageClient(
        protocol=app_config.assets.storage_protocol,
        storage_root=app_config.assets.storage_root,
        base_url=app_config.assets.base_url,
        overwrite=True,  # all the job runners will overwrite the files
        s3_config=app_config.s3,
        # no need to specify cloudfront config here, as we are not generating signed urls in cached entries
    )

    # Enter the three resources as context managers in a single `with` statement,
    # so each one's __exit__ runs when the block is left (normally or on error).
    with (
        LibrariesResource(
            hf_endpoint=app_config.common.hf_endpoint,
            init_hf_datasets_cache=app_config.datasets_based.hf_datasets_cache,
            numba_path=app_config.numba.path,
        ) as libraries_resource,
        CacheMongoResource(
            database=app_config.cache.mongo_database, host=app_config.cache.mongo_url
        ) as cache_resource,
        QueueMongoResource(
            database=app_config.queue.mongo_database, host=app_config.queue.mongo_url
        ) as queue_resource,
    ):
        # Fail fast: raise RuntimeError if either database resource reports that
        # it is not available, before any job runner factory is built.
        if not cache_resource.is_available():
            raise RuntimeError("The connection to the cache database could not be established. Exiting.")
        if not queue_resource.is_available():
            raise RuntimeError("The connection to the queue database could not be established. Exiting.")

        # The factory receives the datasets cache exposed by the libraries resource,
        # together with the directories and storage client prepared above.
        job_runner_factory = JobRunnerFactory(
            app_config=app_config,
            hf_datasets_cache=libraries_resource.hf_datasets_cache,
            parquet_metadata_directory=parquet_metadata_directory,
            statistics_cache_directory=statistics_cache_directory,
            storage_client=storage_client,
        )
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



