build_obelics/01_download_warc.py [66:86]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Seed placeholder columns only when the dataset has neither of them:
    # "warc" is filled with empty bytes and "warc_error" with empty strings,
    # one entry per row.
    # NOTE(review): if exactly one of the two columns is already present,
    # this guard adds nothing and the other column stays missing — confirm
    # that state cannot occur for the datasets loaded upstream.
    if ("warc" not in metadata_dataset.column_names) and ("warc_error" not in metadata_dataset.column_names):
        metadata_dataset = metadata_dataset.add_column("warc", [b""] * len(metadata_dataset))
        metadata_dataset = metadata_dataset.add_column("warc_error", [""] * len(metadata_dataset))
    logger.info("Finished loading the metadata or previous warc dataset")

    # Apply the downloader callable to the dataset via `map`.
    # NOTE(review): presumably `metadata_dataset` is a Hugging Face
    # `datasets.Dataset`, so `num_proc` is the number of worker processes —
    # confirm against the imports and argument parser outside this excerpt.
    warc_downloader = WarcDownloader()
    logger.info("Starting downloading the warc files")
    warc_dataset = metadata_dataset.map(
        warc_downloader,
        num_proc=args.num_proc,
        # Output schema: every existing feature is carried over, then "warc"
        # and "warc_error" are pinned to binary/string. They come after the
        # `**` unpacking, so they override any same-named entries.
        features=Features(
            {
                **metadata_dataset.features,
                "warc": Value("binary"),
                "warc_error": Value("string"),
            }
        ),
    )
    logger.info("Finished downloading the warc files")

    logger.info("Starting saving the warc dataset")
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



obelics/callers/download_warc.py [48:68]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
    # Seed placeholder columns only when the dataset has neither of them:
    # "warc" is filled with empty bytes and "warc_error" with empty strings,
    # one entry per row.
    # NOTE(review): if exactly one of the two columns is already present,
    # this guard adds nothing and the other column stays missing — confirm
    # that state cannot occur for the datasets loaded upstream.
    if ("warc" not in metadata_dataset.column_names) and ("warc_error" not in metadata_dataset.column_names):
        metadata_dataset = metadata_dataset.add_column("warc", [b""] * len(metadata_dataset))
        metadata_dataset = metadata_dataset.add_column("warc_error", [""] * len(metadata_dataset))
    logger.info("Finished loading the metadata or previous warc dataset")

    # Apply the downloader callable to the dataset via `map`.
    # NOTE(review): presumably `metadata_dataset` is a Hugging Face
    # `datasets.Dataset`, so `num_proc` is the number of worker processes —
    # confirm against the imports and argument parser outside this excerpt.
    warc_downloader = WarcDownloader()
    logger.info("Starting downloading the warc files")
    warc_dataset = metadata_dataset.map(
        warc_downloader,
        num_proc=args.num_proc,
        # Output schema: every existing feature is carried over, then "warc"
        # and "warc_error" are pinned to binary/string. They come after the
        # `**` unpacking, so they override any same-named entries.
        features=Features(
            {
                **metadata_dataset.features,
                "warc": Value("binary"),
                "warc_error": Value("string"),
            }
        ),
    )
    logger.info("Finished downloading the warc files")

    logger.info("Starting saving the warc dataset")
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



