111 duplicated lines in: - build_obelics/05_filtering_web_docs.py (113:228, 51%) - obelics/callers/filter_web_documents.py (93:208, 59%) 21 duplicated lines in: - build_obelics/02_bis_extract_html_get_image_urls_new_rules.py (96:117, 15%) - obelics/callers/extract_web_documents.py (125:146, 12%) 19 duplicated lines in: - build_obelics/01_download_warc.py (66:86, 22%) - obelics/callers/download_warc.py (48:68, 32%) 12 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (50:63, 8%) - obelics/callers/extract_html.py (31:44, 25%) 12 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (101:112, 8%) - obelics/visualization/global_visualization.py (51:62, 3%) 12 duplicated lines in: - build_obelics/02_bis_extract_html_get_image_urls_new_rules.py (45:58, 9%) - obelics/callers/extract_html.py (31:44, 25%) 11 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (51:63, 7%) - obelics/callers/download_warc.py (32:44, 18%) 11 duplicated lines in: - build_obelics/06_01_create_set_image_urls_in_webdocs.py (39:51, 13%) - obelics/callers/download_warc.py (32:44, 18%) 11 duplicated lines in: - build_obelics/06_01_create_set_image_urls_in_webdocs.py (39:51, 13%) - obelics/callers/extract_html.py (32:44, 22%) 11 duplicated lines in: - build_obelics/02_bis_extract_html_get_image_urls_new_rules.py (46:58, 8%) - obelics/callers/download_warc.py (32:44, 18%) 11 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (101:111, 7%) - obelics/processors/dom_tree_simplificator.py (15:25, 5%) 10 duplicated lines in: - build_obelics/02_bis_extract_html_get_image_urls_new_rules.py (13:26, 7%) - obelics/callers/extract_web_documents.py (11:24, 5%) 10 duplicated lines in: - build_obelics/01_download_warc.py (11:22, 12%) - obelics/callers/download_warc.py (10:21, 17%) 10 duplicated lines in: - build_obelics/05_filtering_web_docs.py (54:63, 4%) - obelics/callers/filter_web_documents.py (47:56, 5%) 9 duplicated lines in: - build_obelics/04_merge_web_docs_with_images.py (59:67, 6%) - obelics/callers/extract_html.py (32:40, 18%) 9 duplicated lines in: - build_obelics/03_dl_images_create_dataset.py (53:61, 5%) - obelics/callers/extract_web_documents.py (66:74, 5%) 9 duplicated lines in: - build_obelics/04_merge_web_docs_with_images.py (59:67, 6%) - obelics/callers/download_warc.py (32:40, 15%) 9 duplicated lines in: - build_obelics/09_06_line_dedup.py (52:60, 10%) - obelics/processors/web_document_line_deduplication.py (114:122, 7%) 9 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (13:25, 6%) - obelics/callers/extract_web_documents.py (12:24, 5%) 9 duplicated lines in: - build_obelics/05_filtering_web_docs.py (12:23, 4%) - obelics/callers/filter_web_documents.py (11:22, 4%) 9 duplicated lines in: - build_obelics/05_filtering_web_docs.py (12:23, 4%) - obelics/visualization/web_document_and_filtering_visualization.py (13:23, 1%) 8 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (16:25, 5%) - obelics/callers/filter_web_documents.py (25:34, 4%) 8 duplicated lines in: - build_obelics/02_bis_extract_html_get_image_urls_new_rules.py (17:26, 6%) - obelics/callers/extract_html.py (10:19, 16%) 8 duplicated lines in: - build_obelics/05_filtering_web_docs.py (12:19, 3%) - obelics/visualization/choose_filtering_parameters_web_documents_node_level.py (16:23, 4%) 8 duplicated lines in: - build_obelics/04_merge_web_docs_with_images.py (18:27, 6%) - obelics/callers/extract_web_documents.py (15:24, 4%) 8 duplicated lines in: - build_obelics/03_dl_images_create_dataset.py (9:18, 5%) - obelics/callers/download_warc.py (10:19, 13%) 8 duplicated lines in: - build_obelics/01_download_warc.py (11:20, 9%) - obelics/callers/extract_html.py (10:19, 16%) 8 duplicated lines in: - build_obelics/04_merge_web_docs_with_images.py (18:27, 6%) - obelics/callers/download_warc.py (10:19, 13%) 8 duplicated lines in: - build_obelics/03_dl_images_create_dataset.py (9:18, 5%) - obelics/callers/extract_html.py (10:19, 16%) 8 duplicated lines in: - build_obelics/02_bis_extract_html_get_image_urls_new_rules.py (17:26, 6%) - obelics/callers/download_warc.py (10:19, 13%) 8 duplicated lines in: - build_obelics/06_01_create_set_image_urls_in_webdocs.py (12:21, 9%) - obelics/callers/extract_web_documents.py (15:24, 4%) 8 duplicated lines in: - build_obelics/05_filtering_web_docs.py (12:19, 3%) - obelics/utils/__init__.py (2:9, 40%) 8 duplicated lines in: - build_obelics/05_filtering_web_docs.py (27:36, 3%) - obelics/callers/download_warc.py (10:19, 13%) 8 duplicated lines in: - build_obelics/06_03_remove_image_duplicates.py (15:24, 6%) - obelics/callers/download_warc.py (10:19, 13%) 8 duplicated lines in: - build_obelics/02_bis_extract_html_get_image_urls_new_rules.py (17:26, 6%) - obelics/callers/filter_web_documents.py (25:34, 4%) 8 duplicated lines in: - build_obelics/03_dl_images_create_dataset.py (83:90, 5%) - obelics/callers/extract_web_documents.py (78:85, 4%) 8 duplicated lines in: - build_obelics/06_01_create_set_image_urls_in_webdocs.py (12:21, 9%) - obelics/callers/extract_html.py (10:19, 16%) 8 duplicated lines in: - build_obelics/06_01_create_set_image_urls_in_webdocs.py (12:21, 9%) - obelics/callers/filter_web_documents.py (25:34, 4%) 8 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (16:25, 5%) - obelics/callers/download_warc.py (10:19, 13%) 8 duplicated lines in: - build_obelics/03_dl_images_create_dataset.py (72:79, 5%) - obelics/callers/extract_web_documents.py (43:50, 4%) 8 duplicated lines in: - build_obelics/02_bis_extract_html_get_image_urls_new_rules.py (75:83, 6%) - obelics/callers/extract_html.py (48:56, 16%) 8 duplicated lines in: - build_obelics/06_03_remove_image_duplicates.py (15:24, 6%) - obelics/callers/extract_html.py (10:19, 16%) 8 duplicated lines in: - build_obelics/04_merge_web_docs_with_images.py (18:27, 6%) - obelics/callers/filter_web_documents.py (25:34, 4%) 8 duplicated lines in: - build_obelics/03_dl_images_create_dataset.py (9:18, 5%) - obelics/callers/filter_web_documents.py (25:34, 4%) 8 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (16:25, 5%) - obelics/callers/extract_html.py (10:19, 16%) 8 duplicated lines in: - build_obelics/05_filtering_web_docs.py (27:36, 3%) - obelics/callers/extract_html.py (10:19, 16%) 8 duplicated lines in: - build_obelics/05_filtering_web_docs.py (54:61, 3%) - obelics/callers/extract_web_documents.py (43:50, 4%) 8 duplicated lines in: - build_obelics/01_download_warc.py (11:20, 9%) - obelics/callers/filter_web_documents.py (25:34, 4%) 8 duplicated lines in: - build_obelics/03_dl_images_create_dataset.py (72:79, 5%) - obelics/callers/filter_web_documents.py (47:54, 4%) 8 duplicated lines in: - build_obelics/04_merge_web_docs_with_images.py (18:27, 6%) - obelics/callers/extract_html.py (10:19, 16%) 8 duplicated lines in: - build_obelics/03_dl_images_create_dataset.py (9:18, 5%) - obelics/callers/extract_web_documents.py (15:24, 4%) 8 duplicated lines in: - build_obelics/05_filtering_web_docs.py (27:36, 3%) - obelics/callers/extract_web_documents.py (15:24, 4%) 8 duplicated lines in: - build_obelics/06_03_remove_image_duplicates.py (15:24, 6%) - obelics/callers/filter_web_documents.py (25:34, 4%) 8 duplicated lines in: - build_obelics/06_03_remove_image_duplicates.py (15:24, 6%) - obelics/callers/extract_web_documents.py (15:24, 4%) 8 duplicated lines in: - build_obelics/06_01_create_set_image_urls_in_webdocs.py (12:21, 9%) - obelics/callers/download_warc.py (10:19, 13%) 8 duplicated lines in: - build_obelics/05_filtering_web_docs.py (27:36, 3%) - obelics/callers/filter_web_documents.py (25:34, 4%) 8 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (82:90, 5%) - obelics/callers/extract_html.py (48:56, 16%) 8 duplicated lines in: - build_obelics/01_download_warc.py (11:20, 9%) - obelics/callers/extract_web_documents.py (15:24, 4%) 7 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (16:22, 5%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/08_02_urldedup.py (15:21, 9%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/11_02_get_docs_to_remove_by_set_img_urls_dedup.py (15:21, 10%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/09_04_get_domain_to_duplicated_texts.py (10:16, 8%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/12_02_remove_opt_out_images.py (16:22, 10%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/11_01_create_set_img_urls.py (9:15, 15%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/07_01_nsfw_image_filtering.py (24:30, 4%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/07_01_nsfw_image_filtering.py (24:30, 4%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/09_06_line_dedup.py (10:16, 8%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/07_01_nsfw_image_filtering.py (24:30, 4%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/04_merge_web_docs_with_images.py (59:65, 5%) - obelics/callers/filter_web_documents.py (47:53, 3%) 7 duplicated lines in: - build_obelics/09_04_get_domain_to_duplicated_texts.py (10:16, 8%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/05_filtering_web_docs.py (54:60, 3%) - obelics/callers/extract_html.py (32:38, 14%) 7 duplicated lines in: - build_obelics/02_bis_extract_html_get_image_urls_new_rules.py (17:23, 5%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/11_03_set_img_urls_dedup.py (17:23, 8%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/10_final_cleaning.py (19:25, 6%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/10_final_cleaning.py (19:25, 6%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/09_05_merge_domain_to_duplicated_texts_sharded.py (14:20, 8%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/11_02_get_docs_to_remove_by_set_img_urls_dedup.py (15:21, 10%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/11_03_set_img_urls_dedup.py (17:23, 8%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/09_07_merge_web_docs_texts_only_and_rest.py (16:22, 12%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/13_final_processing.py (23:29, 2%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/05_filtering_web_docs.py (27:33, 3%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/12_02_remove_opt_out_images.py (16:22, 10%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/01_download_warc.py (11:17, 8%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/11_03_set_img_urls_dedup.py (17:23, 8%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/08_02_urldedup.py (15:21, 9%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/09_02_get_domain_to_positions.py (10:16, 12%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/13_final_processing.py (23:29, 2%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/12_02_remove_opt_out_images.py (16:22, 10%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/09_06_line_dedup.py (10:16, 8%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/13_final_processing.py (23:29, 2%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (51:57, 5%) - obelics/callers/filter_web_documents.py (47:53, 3%) 7 duplicated lines in: - build_obelics/06_01_create_set_image_urls_in_webdocs.py (39:45, 8%) - obelics/callers/extract_web_documents.py (43:49, 4%) 7 duplicated lines in: - build_obelics/13_final_processing.py (23:29, 2%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/09_04_get_domain_to_duplicated_texts.py (10:16, 8%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/11_03_set_img_urls_dedup.py (17:23, 8%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/09_04_get_domain_to_duplicated_texts.py (10:16, 8%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/01_download_warc.py (11:17, 8%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/11_02_get_docs_to_remove_by_set_img_urls_dedup.py (15:21, 10%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/09_07_merge_web_docs_texts_only_and_rest.py (16:22, 12%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/07_03_nsfw_image_removal.py (16:22, 10%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/07_03_nsfw_image_removal.py (16:22, 10%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/09_07_merge_web_docs_texts_only_and_rest.py (16:22, 12%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/12_02_remove_opt_out_images.py (16:22, 10%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/12_02_remove_opt_out_images.py (16:22, 10%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/07_01_nsfw_image_filtering.py (24:30, 4%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/11_02_get_docs_to_remove_by_set_img_urls_dedup.py (15:21, 10%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/03_dl_images_create_dataset.py (72:78, 4%) - obelics/callers/download_warc.py (32:38, 12%) 7 duplicated lines in: - build_obelics/09_02_get_domain_to_positions.py (10:16, 12%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/09_07_merge_web_docs_texts_only_and_rest.py (16:22, 12%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/09_05_merge_domain_to_duplicated_texts_sharded.py (14:20, 8%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/09_04_get_domain_to_duplicated_texts.py (10:16, 8%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/09_06_line_dedup.py (10:16, 8%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/09_06_line_dedup.py (10:16, 8%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/09_07_merge_web_docs_texts_only_and_rest.py (16:22, 12%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/13_final_processing.py (23:29, 2%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/07_03_nsfw_image_removal.py (16:22, 10%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/02_bis_extract_html_get_image_urls_new_rules.py (46:52, 5%) - obelics/callers/filter_web_documents.py (47:53, 3%) 7 duplicated lines in: - build_obelics/09_07_merge_web_docs_texts_only_and_rest.py (16:22, 12%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/04_merge_web_docs_with_images.py (18:24, 5%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/09_01_create_web_docs_texts_only.py (9:15, 15%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/08_02_urldedup.py (15:21, 9%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/09_02_get_domain_to_positions.py (10:16, 12%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/03_dl_images_create_dataset.py (9:15, 4%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/09_02_get_domain_to_positions.py (10:16, 12%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/10_final_cleaning.py (19:25, 6%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/07_03_nsfw_image_removal.py (16:22, 10%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/06_01_create_set_image_urls_in_webdocs.py (12:18, 8%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/12_02_remove_opt_out_images.py (16:22, 10%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/09_05_merge_domain_to_duplicated_texts_sharded.py (14:20, 8%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/10_final_cleaning.py (19:25, 6%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/02_bis_extract_html_get_image_urls_new_rules.py (46:52, 5%) - obelics/callers/extract_web_documents.py (43:49, 4%) 7 duplicated lines in: - build_obelics/07_03_nsfw_image_removal.py (16:22, 10%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/08_02_urldedup.py (15:21, 9%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/11_02_get_docs_to_remove_by_set_img_urls_dedup.py (15:21, 10%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/02_bis_extract_html_get_image_urls_new_rules.py (17:23, 5%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/09_01_create_web_docs_texts_only.py (9:15, 15%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/11_01_create_set_img_urls.py (9:15, 15%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/09_02_get_domain_to_positions.py (10:16, 12%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/08_02_urldedup.py (15:21, 9%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (16:22, 5%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/09_05_merge_domain_to_duplicated_texts_sharded.py (14:20, 8%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/11_03_set_img_urls_dedup.py (17:23, 8%) - obelics/callers/download_warc.py (10:16, 12%) 7 duplicated lines in: - build_obelics/11_01_create_set_img_urls.py (9:15, 15%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/09_06_line_dedup.py (10:16, 8%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/05_filtering_web_docs.py (27:33, 3%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/11_01_create_set_img_urls.py (9:15, 15%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/10_final_cleaning.py (19:25, 6%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/09_05_merge_domain_to_duplicated_texts_sharded.py (14:20, 8%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/08_02_urldedup.py (15:21, 9%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/09_02_get_domain_to_positions.py (10:16, 12%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/07_01_nsfw_image_filtering.py (24:30, 4%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/09_04_get_domain_to_duplicated_texts.py (10:16, 8%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/06_03_remove_image_duplicates.py (15:21, 6%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/11_01_create_set_img_urls.py (9:15, 15%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/09_01_create_web_docs_texts_only.py (9:15, 15%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/11_03_set_img_urls_dedup.py (17:23, 8%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/03_dl_images_create_dataset.py (72:78, 4%) - obelics/callers/extract_html.py (32:38, 14%) 7 duplicated lines in: - build_obelics/10_final_cleaning.py (19:25, 6%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/06_01_create_set_image_urls_in_webdocs.py (39:45, 8%) - obelics/callers/filter_web_documents.py (47:53, 3%) 7 duplicated lines in: - build_obelics/11_02_get_docs_to_remove_by_set_img_urls_dedup.py (15:21, 10%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/07_01_nsfw_image_filtering.py (24:30, 4%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/09_06_line_dedup.py (10:16, 8%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/09_01_create_web_docs_texts_only.py (9:15, 15%) - obelics/callers/filter_web_documents.py (25:31, 3%) 7 duplicated lines in: - build_obelics/02_extract_html_get_image_urls.py (51:57, 5%) - obelics/callers/extract_web_documents.py (43:49, 4%) 7 duplicated lines in: - build_obelics/09_01_create_web_docs_texts_only.py (9:15, 15%) - obelics/processors/web_document_line_deduplication.py (11:17, 6%) 7 duplicated lines in: - build_obelics/05_filtering_web_docs.py (54:60, 3%) - obelics/callers/download_warc.py (32:38, 12%) 7 duplicated lines in: - build_obelics/07_03_nsfw_image_removal.py (16:22, 10%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/06_03_remove_image_duplicates.py (15:21, 6%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/09_01_create_web_docs_texts_only.py (9:15, 15%) - obelics/callers/extract_web_documents.py (15:21, 4%) 7 duplicated lines in: - build_obelics/04_merge_web_docs_with_images.py (18:24, 5%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/04_merge_web_docs_with_images.py (59:65, 5%) - obelics/callers/extract_web_documents.py (43:49, 4%) 7 duplicated lines in: - build_obelics/09_05_merge_domain_to_duplicated_texts_sharded.py (14:20, 8%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/06_01_create_set_image_urls_in_webdocs.py (12:18, 8%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/03_dl_images_create_dataset.py (9:15, 4%) - obelics/processors/web_document_extractor.py (15:21, 1%) 7 duplicated lines in: - build_obelics/13_final_processing.py (23:29, 2%) - obelics/callers/extract_html.py (10:16, 14%) 7 duplicated lines in: - build_obelics/11_01_create_set_img_urls.py (9:15, 15%) - obelics/processors/web_document_extractor.py (15:21, 1%) 6 duplicated lines in: - build_obelics/01_download_warc.py (43:50, 7%) - obelics/callers/download_warc.py (37:44, 10%) 6 duplicated lines in: - build_obelics/05_filtering_web_docs.py (89:96, 2%) - obelics/callers/filter_web_documents.py (82:89, 3%) 6 duplicated lines in: - build_obelics/01_download_warc.py (43:50, 7%) - obelics/callers/extract_html.py (37:44, 12%)