# in data.py [0:0]
def _format_detail(value, defaults):
    """Return the text detail for a single cell of a NONE_KEY_MAP column.

    Args:
        value: The raw cell value. Any falsy value (None, "", [], 0, ...) is
            treated as "missing".
        defaults: The NONE_KEY_MAP entry for the column; its first element is
            used as the fallback label and must be a string.

    Returns:
        A string: the list items joined with ", " for a list, ``str(value)``
        for any other truthy value, or the humanized default label otherwise.
    """
    if not value:
        # Looked up lazily so a malformed NONE_KEY_MAP entry only matters
        # when a fallback is actually needed.
        return defaults[0].replace("no_", "unspecified ").replace("_", " ")
    if isinstance(value, list):
        # str() each item so lists holding non-string values (ints, None, ...)
        # do not make str.join raise TypeError.
        return ", ".join(str(item) for item in value)
    return str(value)


def preprocess_batch(rows):
    """Add one text-detail column per NONE_KEY_MAP key to a columnar batch.

    ``rows`` is a dictionary mapping each column name to a list of values.
    The batch size is taken from the first column; an empty dictionary is
    treated as a batch of size 0.

    For every key ``k`` in the module-level ``NONE_KEY_MAP`` a new column
    named ``<K>`` (spaces replaced by underscores, upper-cased, wrapped in
    angle brackets) is created. Each entry is the formatted cell value, or a
    default label derived from ``NONE_KEY_MAP[k][0]`` when the cell is falsy
    or the source column is absent.

    If an ``"image"`` column is present, every non-None entry that exposes a
    ``mode`` attribute different from ``"RGB"`` is replaced in place by
    ``entry.convert("RGB")`` (presumably PIL images -- confirm with callers).

    Args:
        rows: Dict of column name -> list of per-example values. It is
            mutated in place.

    Returns:
        The same ``rows`` dictionary, updated with the new columns.
    """
    # Default of () makes an empty batch dict yield n == 0 instead of
    # letting next() raise StopIteration.
    n = len(next(iter(rows.values()), ()))

    # Derive each output column name once instead of once per cell.
    new_keys = {k: f"<{k.replace(' ', '_').upper()}>" for k in NONE_KEY_MAP}
    processed_data = {new_key: [] for new_key in new_keys.values()}

    # Resolve every source column once. A missing column is represented by
    # None rather than by allocating a fresh [None] * n list for every cell.
    sources = [
        (rows.get(k), NONE_KEY_MAP[k], processed_data[new_keys[k]])
        for k in NONE_KEY_MAP
    ]

    # Fill the new columns row by row, visiting keys in NONE_KEY_MAP order.
    for i in range(n):
        for column, defaults, output in sources:
            value = None if column is None else column[i]
            output.append(_format_detail(value, defaults))

    # Convert non-RGB images in place. This runs after the text columns are
    # built, so any detail derived from an "image" cell still sees the
    # unconverted object, as before.
    if "image" in rows:
        images = rows["image"]
        for i in range(n):
            image = images[i]
            if image is None or not hasattr(image, "mode"):
                continue
            if image.mode != "RGB":
                images[i] = image.convert("RGB")

    # Merge the processed columns into the original batch dictionary.
    rows.update(processed_data)
    return rows