vision/m4/sourcing/data_collection/callers/extract_image_text_pairs.py:

import argparse

import jsonlines

from m4.sourcing.data_collection.processors import (
    DOMTreeSimplificator,
    PreExtractionSimplificator,
    TextMediaPairsExtractor,
)
from m4.sourcing.data_collection.utils import load_dataset_html


def extract_image_text_pairs(
    num_min_docs_to_consider=1_000,
    num_min_images_to_consider=1_000,
    strip_multiple_linebreaks=True,
    strip_multiple_spaces=True,
    remove_html_comments=True,
    replace_line_break_tags=True,
    unwrap_tags=True,
    strip_tags=True,
    strip_special_divs=True,
    remove_dates=True,
    remove_empty_leaves=True,
    unnest_nodes=True,
    remake_tree=True,
    only_text_image_nodes=True,
    format_texts=True,
    merge_consecutive_text_nodes=True,
    also_extract_images_not_in_simplified_dom_tree=True,
    extract_clip_scores=True,
    print_results=True,
    save_file=True,
    save_txt_format=False,
    return_pairs=False,
):
    # Simplify the raw HTML DOM tree before extracting (text, image) pairs.
    dom_tree_simplificator = DOMTreeSimplificator(
        strip_multiple_linebreaks=strip_multiple_linebreaks,
        strip_multiple_spaces=strip_multiple_spaces,
        remove_html_comments=remove_html_comments,
        replace_line_break_tags=replace_line_break_tags,
        unwrap_tags=unwrap_tags,
        strip_tags=strip_tags,
        strip_special_divs=strip_special_divs,
        remove_dates=remove_dates,
        remove_empty_leaves=remove_empty_leaves,
        unnest_nodes=unnest_nodes,
        remake_tree=remake_tree,
    )
    pre_extraction_simplificator = PreExtractionSimplificator(
        only_text_image_nodes=only_text_image_nodes,
        format_texts=format_texts,
        merge_consecutive_text_nodes=merge_consecutive_text_nodes,
    )
    extractor = TextMediaPairsExtractor(
        dom_tree_simplificator=dom_tree_simplificator,
        pre_extraction_simplificator=pre_extraction_simplificator,
        also_extract_images_not_in_simplified_dom_tree=also_extract_images_not_in_simplified_dom_tree,
        extract_clip_scores=extract_clip_scores,
    )

    def extraction_from_one_example(example):
        html_str = example["html"]
        url = example["url"]
        extraction = extractor(html_str, url)
        return extraction

    dataset = load_dataset_html()

    images = []
    current_doc = 0
    while not ((len(images) >= num_min_images_to_consider) and (current_doc >= num_min_docs_to_consider)):
        # As soon as we reach both of the minimum counts, we exit
        example = next(dataset)
        extraction = extraction_from_one_example(example)
        images += extraction
        current_doc += 1
        if current_doc % 100 == 0:
            print(f"Extraction done for {current_doc} documents. Extracted {len(images)} images.")

    if print_results:
        print(f"{len(images)} images extracted")
        print(
            "Number of images with alt text of at least 3 words:",
            sum(1 for image in images if ("alt_text" in image) and len(image["alt_text"].split(" ")) >= 3),
        )
        print(
            "Number of images with text: ",
            sum(1 for image in images if "extracted_text" in image),
        )

    if save_file:
        with jsonlines.open("outputs/image_text_pairs.jsonl", "w") as writer:
            writer.write_all(images)

    if save_txt_format:
        # Also dump a human-readable text version of the extracted pairs.
        valid_keys = ["document_url", "src", "alt_text", "extracted_text"]
        with open("outputs/image_text_pairs.txt", "w") as f:
            for image in images:
                for key in valid_keys:
                    if key in image:
                        f.write(f"{key}: {image[key]}\n")
                f.write("\n\n\n\n")

    if return_pairs:
        return images


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Extracting (text, media) pairs from potentially simplified HTML DOMs."
    )
    parser.add_argument(
        "--num_min_docs_to_consider",
        type=int,
        default=1_000,
    )
    parser.add_argument(
        "--num_min_images_to_consider",
        type=int,
        default=1_000,
    )
    parser.add_argument(
        "--strip_multiple_linebreaks",
        action="store_true",
    )
    parser.add_argument(
        "--strip_multiple_spaces",
        action="store_true",
    )
    parser.add_argument(
        "--remove_html_comments",
        action="store_true",
    )
    parser.add_argument(
        "--replace_line_break_tags",
        action="store_true",
    )
    parser.add_argument(
        "--unwrap_tags",
        action="store_true",
    )
    parser.add_argument(
        "--strip_tags",
        action="store_true",
    )
    parser.add_argument(
        "--strip_special_divs",
        action="store_true",
    )
    parser.add_argument(
        "--remove_dates",
        action="store_true",
    )
    parser.add_argument(
        "--remove_empty_leaves",
        action="store_true",
    )
    parser.add_argument(
        "--unnest_nodes",
        action="store_true",
    )
    parser.add_argument(
        "--remake_tree",
        action="store_true",
    )
    parser.add_argument(
        "--only_text_image_nodes",
        action="store_true",
    )
    parser.add_argument(
        "--format_texts",
        action="store_true",
    )
    parser.add_argument(
        "--merge_consecutive_text_nodes",
        action="store_true",
    )
    parser.add_argument(
        "--also_extract_images_not_in_simplified_dom_tree",
        action="store_true",
    )
    parser.add_argument(
        "--extract_clip_scores",
        action="store_true",
    )
    parser.add_argument(
        "--print_results",
        action="store_true",
    )
    parser.add_argument(
        "--save_file",
        action="store_true",
    )
    parser.add_argument(
        "--save_txt_format",
        action="store_true",
    )
    parser.add_argument(
        "--return_pairs",
        action="store_true",
    )
    args = parser.parse_args()

    extract_image_text_pairs(
        num_min_docs_to_consider=args.num_min_docs_to_consider,
        num_min_images_to_consider=args.num_min_images_to_consider,
        strip_multiple_linebreaks=args.strip_multiple_linebreaks,
        strip_multiple_spaces=args.strip_multiple_spaces,
        remove_html_comments=args.remove_html_comments,
        replace_line_break_tags=args.replace_line_break_tags,
        unwrap_tags=args.unwrap_tags,
        strip_tags=args.strip_tags,
        strip_special_divs=args.strip_special_divs,
        remove_dates=args.remove_dates,
        remove_empty_leaves=args.remove_empty_leaves,
        unnest_nodes=args.unnest_nodes,
        remake_tree=args.remake_tree,
        only_text_image_nodes=args.only_text_image_nodes,
        format_texts=args.format_texts,
        merge_consecutive_text_nodes=args.merge_consecutive_text_nodes,
        print_results=args.print_results,
        also_extract_images_not_in_simplified_dom_tree=args.also_extract_images_not_in_simplified_dom_tree,
        extract_clip_scores=args.extract_clip_scores,
        save_file=args.save_file,
        save_txt_format=args.save_txt_format,
        return_pairs=args.return_pairs,
    )
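
A minimal programmatic usage sketch, in addition to the CLI entry point above. It assumes the m4 package is importable (the module path below simply mirrors the file location and is an assumption about how the package is laid out) and that load_dataset_html can stream documents in your environment:

# Hypothetical usage sketch: the import path mirrors the file location above.
from m4.sourcing.data_collection.callers.extract_image_text_pairs import extract_image_text_pairs

# Process a small sample and keep the pairs in memory instead of writing
# outputs/image_text_pairs.jsonl to disk.
pairs = extract_image_text_pairs(
    num_min_docs_to_consider=100,
    num_min_images_to_consider=100,
    save_file=False,
    return_pairs=True,
)
print(f"{len(pairs)} image-text pairs extracted")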