obelics/callers/extract_web_documents.py

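# Example invocation (a sketch, not taken from the repository's docs: every
# flag is optional, and the values shown are simply the argparse defaults
# defined in get_args() below):
#
#   python obelics/callers/extract_web_documents.py \
#       --path_html_dataset ./large_files/html_documents_10000 \
#       --path_save_dir_dataset ./large_files/output_extraction/web_documents_10000 \
#       --num_proc_urls_to_images 15
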
import argparse
import logging
from multiprocessing import cpu_count

import yaml
from datasets import load_from_disk

from obelics.processors import (
    CommonCrawlWebDocumentExtractor,
    DOMTreeSimplificator,
    PreExtractionSimplificator,
)


logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
    datefmt="%m/%d/%Y %H:%M:%S",
)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)


def get_args():
    parser = argparse.ArgumentParser(description="Extract web documents.")
    parser.add_argument(
        "--path_config_extract_web_documents",
        type=str,
        default="./obelics/configs/config_extract_web_documents.yaml",
        help="The path of the config file containing the extraction parameters.",
    )
    parser.add_argument(
        "--path_html_dataset",
        type=str,
        default="./large_files/html_documents_10000",
        help="Path of the dataset containing the HTML documents.",
    )
    parser.add_argument(
        "--path_save_dir_dataset",
        type=str,
        default="./large_files/output_extraction/web_documents_10000",
        help="The directory to save the dataset.",
    )
    parser.add_argument(
        "--num_proc",
        type=int,
        default=cpu_count(),
        help="Number of processes to use for the multiprocessing.",
    )
    parser.add_argument(
        "--path_save_file_image_urls",
        type=str,
        default="./large_files/output_extraction/image_urls.txt",
        help="The file to save the urls of all images.",
    )
    parser.add_argument(
        "--path_save_dir_downloaded_images",
        type=str,
        default="./large_files/output_extraction/downloaded_images",
        help="The directory to save all images.",
    )
    parser.add_argument(
        "--thread_count",
        type=int,
        default=256,
        help="The number of threads used for downloading the images.",
    )
    parser.add_argument(
        "--number_sample_per_shard",
        type=int,
        default=10_000,
        help="The number of images that will be downloaded in one shard.",
    )
    parser.add_argument(
        "--path_save_dir_tmp_datasets_images",
        type=str,
        default="./large_files/output_extraction/tmp_datasets_images",
        help=(
            "The directory to save the temporary datasets containing all images (useful for the code but can be"
            " deleted afterwards)."
        ),
    )
    parser.add_argument(
        "--path_save_dir_dataset_images",
        type=str,
        default="./large_files/output_extraction/dataset_images",
        help="The directory to save the dataset containing all images.",
    )
    parser.add_argument(
        "--path_save_file_map_url_idx",
        type=str,
        default="./large_files/output_extraction/map_url_idx.json",
        help="The file to save the map from image urls to indices in the dataset containing all images.",
    )
    parser.add_argument(
        "--num_proc_urls_to_images",
        type=int,
        default=15,
        help=(
            "Number of processes to use for the multiprocessing in the step `urls_to_images`. Reduce this value in"
            " case of OOM errors."
        ),
    )
    parser.add_argument(
        "--path_save_dir_sharded_dataset",
        type=str,
        default="./large_files/output_extraction/web_documents_10000_sharded",
        help="The directory to save the sharded dataset.",
    )
    parser.add_argument(
        "--shard_size",
        type=int,
        default=20_000,  # 500 shards for 10M web documents
        help="The size of a shard for the sharded dataset.",
    )
    args = parser.parse_args()
    return args


if __name__ == "__main__":
    args = get_args()

    logger.info("Starting loading the HTML dataset")
    html_dataset = load_from_disk(args.path_html_dataset)
    logger.info("Finished loading the HTML dataset")

    with open(args.path_config_extract_web_documents) as f:
        extraction_params = yaml.load(f, Loader=yaml.FullLoader)
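
    # The config file is expected to mirror the sections and keys read below.
    # A minimal sketch of its structure; the section and key names come from
    # this script, but the values shown are illustrative assumptions, not
    # copied from the repository's actual config:
    #
    #   dom_tree_simplificator:
    #     strip_multiple_linebreaks: true
    #     strip_multiple_spaces: true
    #     remove_html_comments: true
    #     replace_line_break_tags: true
    #     unwrap_tags: true
    #     strip_tags: true
    #     strip_special_divs: true
    #     remove_dates: true
    #     remove_empty_leaves: true
    #     unnest_nodes: true
    #     remake_tree: true
    #     css_rules: true
    #     css_rules_replace_with_text: true
    #   pre_extraction_simplificator:
    #     only_text_image_nodes: true
    #     format_texts: true
    #     merge_consecutive_text_nodes: true
    #   web_document_extractor:
    #     image_size: 256
    #     resize_mode: border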
    dom_tree_simplificator = DOMTreeSimplificator(
        strip_multiple_linebreaks=extraction_params["dom_tree_simplificator"]["strip_multiple_linebreaks"],
        strip_multiple_spaces=extraction_params["dom_tree_simplificator"]["strip_multiple_spaces"],
        remove_html_comments=extraction_params["dom_tree_simplificator"]["remove_html_comments"],
        replace_line_break_tags=extraction_params["dom_tree_simplificator"]["replace_line_break_tags"],
        unwrap_tags=extraction_params["dom_tree_simplificator"]["unwrap_tags"],
        strip_tags=extraction_params["dom_tree_simplificator"]["strip_tags"],
        strip_special_divs=extraction_params["dom_tree_simplificator"]["strip_special_divs"],
        remove_dates=extraction_params["dom_tree_simplificator"]["remove_dates"],
        remove_empty_leaves=extraction_params["dom_tree_simplificator"]["remove_empty_leaves"],
        unnest_nodes=extraction_params["dom_tree_simplificator"]["unnest_nodes"],
        remake_tree=extraction_params["dom_tree_simplificator"]["remake_tree"],
        css_rules=extraction_params["dom_tree_simplificator"]["css_rules"],
        css_rules_replace_with_text=extraction_params["dom_tree_simplificator"]["css_rules_replace_with_text"],
    )
    pre_extraction_simplificator = PreExtractionSimplificator(
        only_text_image_nodes=extraction_params["pre_extraction_simplificator"]["only_text_image_nodes"],
        format_texts=extraction_params["pre_extraction_simplificator"]["format_texts"],
        merge_consecutive_text_nodes=extraction_params["pre_extraction_simplificator"]["merge_consecutive_text_nodes"],
    )

    path_save_dir_dataset = args.path_save_dir_dataset
    num_proc = args.num_proc
    path_save_file_image_urls = args.path_save_file_image_urls
    path_save_dir_downloaded_images = args.path_save_dir_downloaded_images
    thread_count = args.thread_count
    number_sample_per_shard = args.number_sample_per_shard
    image_size = extraction_params["web_document_extractor"]["image_size"]
    resize_mode = extraction_params["web_document_extractor"]["resize_mode"]
    path_save_dir_tmp_datasets_images = args.path_save_dir_tmp_datasets_images
    path_save_dir_dataset_images = args.path_save_dir_dataset_images
    path_save_file_map_url_idx = args.path_save_file_map_url_idx
    num_proc_urls_to_images = args.num_proc_urls_to_images
    path_save_dir_sharded_dataset = args.path_save_dir_sharded_dataset
    shard_size = args.shard_size

    web_document_extractor = CommonCrawlWebDocumentExtractor(
        html_dataset=html_dataset,
        dom_tree_simplificator=dom_tree_simplificator,
        pre_extraction_simplificator=pre_extraction_simplificator,
        path_save_dir_dataset=path_save_dir_dataset,
        num_proc=num_proc,
        path_save_file_image_urls=path_save_file_image_urls,
        path_save_dir_downloaded_images=path_save_dir_downloaded_images,
        thread_count=thread_count,
        number_sample_per_shard=number_sample_per_shard,
        image_size=image_size,
        resize_mode=resize_mode,
        path_save_dir_tmp_datasets_images=path_save_dir_tmp_datasets_images,
        path_save_dir_dataset_images=path_save_dir_dataset_images,
        path_save_file_map_url_idx=path_save_file_map_url_idx,
        num_proc_urls_to_images=num_proc_urls_to_images,
        path_save_dir_sharded_dataset=path_save_dir_sharded_dataset,
        shard_size=shard_size,
    )
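
    # Run the pipeline end to end. The step descriptions below are inferred
    # from the method names; see CommonCrawlWebDocumentExtractor for the
    # authoritative behavior of each step:
    #   1. html_to_web_documents: simplify the DOM trees and extract web documents
    #   2. get_image_urls: collect the urls of the images referenced in the documents
    #   3. download_images: download the images (multi-threaded, in shards)
    #   4. create_dataset_images: gather the downloaded images into a single dataset
    #   5. urls_to_images: replace the image urls in the documents with the downloaded images
    #   6. save_dataset / save_commit_hash / save_split_sharded_dataset: persist the outputs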
    web_document_extractor.html_to_web_documents()
    web_document_extractor.get_image_urls()
    web_document_extractor.download_images()
    web_document_extractor.create_dataset_images()
    web_document_extractor.urls_to_images()
    web_document_extractor.save_dataset()
    web_document_extractor.save_commit_hash()
    web_document_extractor.save_split_sharded_dataset()