in annotation/fn-SMGT-Post/main.py [0:0]
def handler(event, context):
    """Consolidate worker annotations of OCR review tasks into one label per dataset object.

    The annotation payload is fetched from the S3 object named by
    ``event["payload"]["s3Uri"]`` and parsed as JSON. It is expected to be a list of
    dataset objects, each carrying a ``datasetObjectId`` and a list of ``annotations``;
    every annotation holds a ``workerId`` and a JSON string under
    ``annotationData.content``.

    Within each worker's content, flat form fields named ``ocr-<id>-<suffix>`` are folded
    into a list under ``ocrAnnotations``, one entry per ``<id>``:

    * ``ocr-<id>-meta`` (a JSON string) seeds the entry, if present.
    * ``ocr-<id>-correct`` / ``-unclear`` / ``-wrong`` (objects with an ``on`` flag)
      determine ``status``. The first selected status wins.
    * ``ocr-<id>-override`` becomes ``correction``, but only when ``wrong`` was selected.

    The consumed raw fields are removed from the worker's annotation. Problems found on
    the way (missing, multiple or undeterminable statuses) are collected as strings under
    ``consolidationWarnings``.

    NOTE(review): the surrounding path suggests this is a SageMaker Ground Truth
    post-annotation Lambda; confirm the event/response contract against the caller.

    Args:
        event: Invocation event; ``payload.s3Uri`` and ``labelAttributeName`` are read.
        context: Invocation context (unused).

    Returns:
        A list with one dict per dataset object, of the form
        ``{"datasetObjectId": ..., "consolidatedAnnotation": {"content": {<labelAttributeName>: {...}}}}``.
        The inner dict holds ``workerAnnotations`` (all normalized worker annotations),
        optionally ``consolidationWarnings``, and a copy of every other top-level key of
        the first worker's annotation.
    """
    ocr_statuses = ("correct", "unclear", "wrong")
    # Compiled once: the same pattern is applied to every key of every annotation.
    ocr_key_pattern = re.compile(r"ocr-(.*)-[a-z]+", flags=re.IGNORECASE)

    consolidated_labels = []

    parsed_url = urlparse(event["payload"]["s3Uri"])
    logger.info("Consolidating labels from %s", event["payload"]["s3Uri"])
    # The URI path starts with "/", which is not part of the object key.
    response = s3.get_object(Bucket=parsed_url.netloc, Key=parsed_url.path[1:])
    annotations = json.loads(response["Body"].read())

    for dataset in annotations:
        dataset_worker_anns = []
        dataset_warnings = []
        consolidated_label = {
            "workerAnnotations": dataset_worker_anns,
        }
        label = {
            "datasetObjectId": dataset["datasetObjectId"],
            "consolidatedAnnotation": {
                "content": {
                    event["labelAttributeName"]: consolidated_label,
                },
            },
        }

        for annotation in dataset["annotations"]:
            ann_data = json.loads(annotation["annotationData"]["content"])
            ann_data["workerId"] = annotation["workerId"]

            # Find the unique OCR annotation IDs. They are collected up front because the
            # matching fields get deleted from ann_data below, and sorted so that output
            # and warning order do not depend on set iteration order.
            key_matches = (ocr_key_pattern.match(key) for key in ann_data)
            ann_ocr_ids = sorted({m.group(1) for m in key_matches if m})

            # Normalize the OCR labels for this annotation:
            ocr_ann_data = []
            ann_data["ocrAnnotations"] = ocr_ann_data
            for ocr_id in ann_ocr_ids:
                meta_field_key = f"ocr-{ocr_id}-meta"
                if meta_field_key in ann_data:
                    ocr_datum = json.loads(ann_data[meta_field_key])
                    del ann_data[meta_field_key]
                else:
                    ocr_datum = {}
                ocr_datum["annotationId"] = ocr_id

                # Consolidate the field's status from (potentially missing/inconsistent)
                # radios:
                status_fields = {s: f"ocr-{ocr_id}-{s}" for s in ocr_statuses}
                unknown_statuses = [
                    s for s, field in status_fields.items() if field not in ann_data
                ]
                selected_statuses = [
                    s for s, field in status_fields.items() if ann_data.get(field, {}).get("on")
                ]
                if selected_statuses:
                    ocr_datum["status"] = selected_statuses[0]
                else:
                    dataset_warnings.append(
                        f"Missing correct/unclear/wrong status for OCR field {ocr_id}",
                    )
                if len(selected_statuses) > 1:
                    dataset_warnings.append(
                        "OCR field {} tagged to multiple statuses {}: Taking first value".format(
                            ocr_id,
                            selected_statuses,
                        )
                    )
                if unknown_statuses:
                    # Adjacent literals are concatenated by the parser; str.join() only
                    # accepts a single iterable, so it cannot be used with two arguments.
                    dataset_warnings.append(
                        (
                            "Could not determine whether the following statuses were selected "
                            "for OCR field {}: {}"
                        ).format(
                            ocr_id,
                            unknown_statuses,
                        )
                    )
                for field in status_fields.values():
                    if field in ann_data:
                        del ann_data[field]

                # Load in the correction text, if provided:
                correction_field_key = f"ocr-{ocr_id}-override"
                if correction_field_key in ann_data:
                    # Ignore correction if 'wrong' was not selected:
                    if "wrong" in selected_statuses:
                        ocr_datum["correction"] = ann_data[correction_field_key]
                    # Tidy up the raw field regardless:
                    del ann_data[correction_field_key]

                ocr_ann_data.append(ocr_datum)

            dataset_worker_anns.append(ann_data)

        if dataset_warnings:
            consolidated_label["consolidationWarnings"] = dataset_warnings

        if dataset_worker_anns:
            # Take first annotation as 'consolidated' value, without overwriting the keys
            # already set above (workerAnnotations, consolidationWarnings):
            for key, value in dataset_worker_anns[0].items():
                if key not in consolidated_label:
                    consolidated_label[key] = value

        consolidated_labels.append(label)

    return consolidated_labels