Path Lines of Code build_obelics/01_download_warc.py 89 build_obelics/02_bis_extract_html_get_image_urls_new_rules.py 140 build_obelics/02_extract_html_get_image_urls.py 144 build_obelics/02_parallel_extract_html_get_image_urls.py 113 build_obelics/03_dl_images_create_dataset.py 163 build_obelics/03_parallel_dl_images_create_dataset.py 31 build_obelics/04_merge_web_docs_with_images.py 140 build_obelics/05_filtering_web_docs.py 225 build_obelics/06_01_create_set_image_urls_in_webdocs.py 90 build_obelics/06_02_merge_sets_image_urls_in_webdocs.py 48 build_obelics/06_03_remove_image_duplicates.py 123 build_obelics/07_01_nsfw_image_filtering.py 175 build_obelics/07_02_nsfw_image_visualization.py 68 build_obelics/07_03_nsfw_image_removal.py 77 build_obelics/08_01_prepare_urldedup.py 64 build_obelics/08_02_urldedup.py 82 build_obelics/09_01_create_web_docs_texts_only.py 49 build_obelics/09_02_get_domain_to_positions.py 62 build_obelics/09_03_split_domain_to_positions.py 34 build_obelics/09_04_get_domain_to_duplicated_texts.py 90 build_obelics/09_05_merge_domain_to_duplicated_texts_sharded.py 86 build_obelics/09_06_line_dedup.py 91 build_obelics/09_07_merge_web_docs_texts_only_and_rest.py 64 build_obelics/10_final_cleaning.py 113 build_obelics/11_01_create_set_img_urls.py 50 build_obelics/11_02_get_docs_to_remove_by_set_img_urls_dedup.py 75 build_obelics/11_03_set_img_urls_dedup.py 94 build_obelics/12_01_find_opt_out_images.py 79 build_obelics/12_02_remove_opt_out_images.py 74 build_obelics/13_final_processing.py 276