def _generate_examples()

in vision/m4/sourcing/pmd/jz_loaders/jz_wit/jz_wit.py [0:0]


    def _generate_examples(self, files, paths):
        """Yield chunks of English caption examples read from tab-separated files.

        Args:
            files: iterable of paths to tab-separated files. Each row must
                provide at least the ``language``, ``image_url`` and the three
                ``caption_*_description`` columns, plus the metadata columns
                copied into ``meta`` below.
            paths: path to a tab-separated file with ``url``, ``downloaded``
                and ``path`` columns, mapping an image URL to its download
                status and its location relative to ``$DSDIR/WIT``.

        Yields:
            Lists of at most ``self.CHUNK_SIZE`` dicts with the keys
            ``image_path`` (``None`` when the image was not downloaded),
            ``text``, ``source`` and ``meta`` (a JSON string).

        Raises:
            KeyError: if ``DSDIR`` is not set in the environment, or if a row's
                ``image_url`` has no entry in ``paths``.
        """
        # csv.DictReader returns every field as a string, so bool("False") and
        # bool("0") would both be True. Parse the flag explicitly instead; any
        # value outside this set keeps its previous "truthy" meaning.
        falsy_flags = {"", "0", "false", "no", "none"}

        image_paths = {}
        with open(paths, encoding="utf-8") as f:
            for instance in csv.DictReader(f, delimiter="\t"):
                flag = (instance["downloaded"] or "").strip().lower()
                image_paths[instance["url"]] = (
                    flag not in falsy_flags,
                    os.path.join(os.environ["DSDIR"], "WIT", instance["path"]),
                )
        logger.info("Loaded paths.")

        # Columns copied verbatim into the "meta" JSON, in output order.
        metadata_keys = (
            "caption_reference_description",
            "caption_attribution_description",
            "caption_alt_text_description",
            "page_url",
            "page_title",
            "section_title",
            "hierarchical_section_title",
            "mime_type",
            "original_height",
            "original_width",
            "attribution_passes_lang_id",
            "page_changed_recently",
            "context_page_description",
            "context_section_description",
            "image_url",
            "language",
            "is_main_image",
        )

        buffer = []
        for file in files:
            with open(file, "r", encoding="utf-8") as f:
                for example in csv.DictReader(f, delimiter="\t"):
                    if example["language"] != "en":
                        continue

                    caption = None

                    # First choice: the reference description, if it has at
                    # least two space-separated tokens.
                    reference = example["caption_reference_description"]
                    if reference is not None:
                        caption = reference
                        if len(caption.split(" ")) < 2:
                            caption = None

                    # Second choice: the English part of the attribution text.
                    attribution = example["caption_attribution_description"]
                    if caption is None and attribution is not None:
                        if "english:" in attribution.lower():
                            # Splits usually occur as "English: [Text] Italian: [Text]"
                            # so we split and take the relevant section.
                            splits = attribution.split(": ")
                            for idx, split in enumerate(splits):
                                clean = split.strip().lower()
                                if clean.endswith("english") or clean.startswith("english"):
                                    if idx + 1 < len(splits):
                                        caption = splits[idx + 1].strip()
                                    # Case of extra languages at the end:
                                    # "English: [Text] Italian: [Text]" -> drop the
                                    # trailing word, which is the next language label.
                                    if idx + 2 < len(splits):
                                        caption = " ".join(caption.split(" ")[:-1]).strip()

                        if caption is not None and len(caption.split(" ")) < 3:
                            caption = None

                    # Last choice: the alt text, if it has at least three tokens.
                    alt_text = example["caption_alt_text_description"]
                    if caption is None and alt_text is not None:
                        caption = alt_text
                        if len(caption.split(" ")) < 3:
                            caption = None

                    if caption is None:
                        continue

                    # Keep ASCII characters only; skip captions left empty by that.
                    caption = "".join(c for c in caption if ord(c) < 128)
                    if not caption:
                        continue

                    metadata_dict = {key: example[key] for key in metadata_keys}
                    image_downloaded, image_path = image_paths[example["image_url"]]
                    buffer.append(
                        {
                            "image_path": image_path if image_downloaded else None,
                            "text": caption,
                            "source": "google/wit",
                            "meta": json.dumps(metadata_dict, default=json_serializer, indent=2),
                        }
                    )
                    if len(buffer) == self.CHUNK_SIZE:
                        yield buffer
                        buffer = []
        # Flush the final, partially filled chunk.
        if len(buffer) > 0:
            yield buffer