in vision/m4/sourcing/pmd/jz_loaders/jz_wit/jz_wit.py [0:0]
def _generate_examples(self, files, paths):
    """Yield chunks of English WIT caption records paired with image paths.

    A caption is chosen per row with the following priority:

    1. ``caption_reference_description`` if it has at least 2
       space-separated tokens;
    2. the English section of ``caption_attribution_description`` (text
       following an ``English:`` label) if it has at least 3 tokens;
    3. ``caption_alt_text_description`` if it has at least 3 tokens.

    Rows whose ``language`` is not ``"en"``, rows without a usable caption,
    and rows whose caption is empty after dropping non-ASCII characters are
    skipped.

    Args:
        files: Iterable of paths to tab-separated annotation files with a
            header row.
        paths: Path to a tab-separated file with ``url``, ``downloaded`` and
            ``path`` columns, mapping an image URL to its download status
            and its location relative to ``$DSDIR/WIT``.

    Yields:
        Lists of dicts with keys ``image_path``, ``text``, ``source`` and
        ``meta`` (a JSON string of the original row's metadata). Every list
        holds ``self.CHUNK_SIZE`` records except possibly the last one.
        ``image_path`` is ``None`` when the image is flagged as not
        downloaded.

    Raises:
        KeyError: If the ``DSDIR`` environment variable is unset, if an
            expected column is missing from a header, or if a row's
            ``image_url`` has no entry in ``paths``.
    """
    # ``csv`` returns every field as ``str``, and any non-empty string --
    # including "False" and "0" -- is truthy, so ``bool(field)`` cannot be
    # used to read the flag. Only clearly negative spellings map to False;
    # every other non-empty value keeps being treated as "downloaded".
    falsy_flags = {"", "0", "false", "f", "no", "n", "none"}

    # Column names copied verbatim into the ``meta`` JSON, in output order.
    metadata_keys = (
        "caption_reference_description",
        "caption_attribution_description",
        "caption_alt_text_description",
        "page_url",
        "page_title",
        "section_title",
        "hierarchical_section_title",
        "mime_type",
        "original_height",
        "original_width",
        "attribution_passes_lang_id",
        "page_changed_recently",
        "context_page_description",
        "context_section_description",
        "image_url",
        "language",
        "is_main_image",
    )

    image_root = os.path.join(os.environ["DSDIR"], "WIT")
    image_paths = {}
    with open(paths, encoding="utf-8") as f:
        for instance in csv.DictReader(f, delimiter="\t"):
            flag = instance["downloaded"]
            # ``flag`` is None when the row is shorter than the header.
            downloaded = flag is not None and flag.strip().lower() not in falsy_flags
            image_paths[instance["url"]] = (
                downloaded,
                os.path.join(image_root, instance["path"]),
            )
    logger.info("Loaded paths.")

    buffer = []
    for file in files:
        with open(file, "r", encoding="utf-8") as f:
            for example in csv.DictReader(f, delimiter="\t"):
                if example["language"] != "en":
                    continue

                caption = None

                # 1) Reference description, if it is more than a single token.
                reference = example["caption_reference_description"]
                if reference is not None and len(reference.split(" ")) >= 2:
                    caption = reference

                # 2) English section of the attribution description.
                if caption is None:
                    attribution = example["caption_attribution_description"]
                    if attribution is not None and "english:" in attribution.lower():
                        # Splits usually occur as "English: [Text] Italian: [Text]"
                        # so we split on ": " and take the chunk after the label.
                        splits = attribution.split(": ")
                        for idx, split in enumerate(splits):
                            clean = split.strip().lower()
                            if not (clean.endswith("english") or clean.startswith("english")):
                                continue
                            if idx + 1 >= len(splits):
                                continue
                            caption = splits[idx + 1].strip()
                            # Case of extra languages at the end:
                            # "English: [Text] Italian: [Text]" -- the last word
                            # of this chunk is the next language's label.
                            if idx + 2 < len(splits):
                                caption = " ".join(caption.split(" ")[:-1]).strip()
                        if caption is not None and len(caption.split(" ")) < 3:
                            caption = None

                # 3) Alt text as the last resort.
                if caption is None:
                    alt_text = example["caption_alt_text_description"]
                    if alt_text is not None and len(alt_text.split(" ")) >= 3:
                        caption = alt_text

                if caption is None:
                    continue

                # Keep ASCII characters only; skip captions that end up empty.
                caption = "".join(c for c in caption if ord(c) < 128)
                if not caption:
                    continue

                metadata_dict = {key: example[key] for key in metadata_keys}
                image_downloaded, image_path = image_paths[example["image_url"]]
                buffer.append(
                    {
                        "image_path": image_path if image_downloaded else None,
                        "text": caption,
                        "source": "google/wit",
                        "meta": json.dumps(metadata_dict, default=json_serializer, indent=2),
                    }
                )
                if len(buffer) == self.CHUNK_SIZE:
                    yield buffer
                    buffer = []

    # Flush the final, possibly partial, chunk.
    if buffer:
        yield buffer