path # lines of code build_obelics/08_01_prepare_urldedup.py 64 build_obelics/02_extract_html_get_image_urls.py 144 build_obelics/01_download_warc.py 89 build_obelics/06_02_merge_sets_image_urls_in_webdocs.py 48 build_obelics/02_parallel_extract_html_get_image_urls.py 113 build_obelics/09_07_merge_web_docs_texts_only_and_rest.py 64 build_obelics/11_02_get_docs_to_remove_by_set_img_urls_dedup.py 75 build_obelics/10_final_cleaning.py 113 build_obelics/09_05_merge_domain_to_duplicated_texts_sharded.py 86 build_obelics/03_parallel_dl_images_create_dataset.py 31 build_obelics/09_06_line_dedup.py 91 build_obelics/09_04_get_domain_to_duplicated_texts.py 90 build_obelics/04_merge_web_docs_with_images.py 140 build_obelics/05_filtering_web_docs.py 225 build_obelics/03_dl_images_create_dataset.py 163 build_obelics/06_01_create_set_image_urls_in_webdocs.py 90 build_obelics/02_bis_extract_html_get_image_urls_new_rules.py 140 build_obelics/08_02_urldedup.py 82 build_obelics/11_01_create_set_img_urls.py 50 build_obelics/09_02_get_domain_to_positions.py 62 build_obelics/11_03_set_img_urls_dedup.py 94 build_obelics/12_01_find_opt_out_images.py 79 build_obelics/07_02_nsfw_image_visualization.py 68 build_obelics/07_01_nsfw_image_filtering.py 175 build_obelics/07_03_nsfw_image_removal.py 77 build_obelics/09_01_create_web_docs_texts_only.py 49 build_obelics/12_02_remove_opt_out_images.py 74 build_obelics/13_final_processing.py 276 build_obelics/06_03_remove_image_duplicates.py 123 build_obelics/09_03_split_domain_to_positions.py 34 obelics/callers/__init__.py 1 obelics/callers/extract_web_documents.py 176 obelics/callers/extract_html.py 53 obelics/callers/filter_web_documents.py 196 obelics/callers/download_warc.py 63 obelics/callers/line_deduplicate_web_documents.py 65 obelics/configs/config_extract_web_documents.yaml 23 obelics/configs/config_filter_web_documents.yaml 64 obelics/utils/tags_attributes.py 82 obelics/utils/__init__.py 24 obelics/utils/simplification_utils.py 176 obelics/utils/utils.py 4 obelics/utils/filtering_utils.py 427 obelics/__init__.py 1 obelics/processors/warc_downloader.py 36 obelics/processors/__init__.py 11 obelics/processors/pre_extraction_simplificator.py 152 obelics/processors/dom_tree_simplificator.py 195 obelics/processors/html_extractor.py 52 obelics/processors/web_document_extractor.py 371 obelics/processors/web_document_line_deduplication.py 121 obelics/processors/web_document_filtering.py 1002 obelics/visualization/web_document_visualization.py 59 obelics/visualization/web_document_and_filtering_visualization.py 675 obelics/visualization/__init__.py 1 obelics/visualization/assets/DOM_tree_viz.html 172 obelics/visualization/global_visualization.py 321 obelics/visualization/choose_filtering_parameters_web_documents_node_level.py 205