in src/dlp-runner/main.py [0:0]
def process_image(input_file_bucket, input_file, output_file_bucket,
                  output_file, project, inspect_template, include_quotes,
                  labels):
    """Download an image from GCS, redact it with DLP, and upload the results.

    The redacted image is written to gs://output_file_bucket/output_file and
    the DLP findings (JSON) to the same bucket under the same name with a
    ".json" extension.

    Args:
        input_file_bucket: GCS bucket holding the source image.
        input_file: Object path of the source image within that bucket.
        output_file_bucket: GCS bucket for the redacted image and findings.
        output_file: Object path for the redacted image.
        project: GCP project used for the DLP request.
        inspect_template: DLP inspect template to apply.
        include_quotes: Whether findings should include the matched text.
        labels: Optional dict of key/value labels attached to each finding.

    Returns:
        Dict with "redacted_image" and "findings" entries, each containing
        the bucket and file of the uploaded object.
    """
    # Unique temp filenames so concurrent invocations don't collide.
    # (renamed from `hash`, which shadowed the builtin)
    _, ext = os.path.splitext(os.path.basename(input_file))
    uid = str(uuid.uuid4())
    tmp_file = f"{uid}{ext}"
    tmp_file_redacted = f"{uid}-redacted{ext}"
    try:
        # Download the source image from GCS.
        print(f"Downloading input file from gs://{input_file_bucket}/{input_file}")
        input_bucket_client = storage_client.get_bucket(input_file_bucket)
        blob_pdf = input_bucket_client.get_blob(input_file)
        blob_pdf.download_to_filename(tmp_file)
        print(f"Input file downloaded from GCS to {tmp_file}")
        # Redact the file using DLP.
        findings = redact_image(project, tmp_file, tmp_file_redacted,
                                inspect_template, include_quotes)
        print(f"Redacted image saved to file {tmp_file_redacted}")
        # Upload the redacted image to the output bucket.
        output_bucket_client = storage_client.get_bucket(output_file_bucket)
        out_blob = output_bucket_client.blob(output_file)
        out_blob.upload_from_filename(tmp_file_redacted)
        print(
            f"Redacted image uploaded to gs://{output_file_bucket}/{output_file}")
        for f in findings:
            # create_time arrives as an RFC 3339 string, but BQ expects the
            # protobuf Timestamp shape ({seconds, nanos}). Trim to the first
            # 19 characters ("YYYY-MM-DDTHH:MM:SS") because the API omits
            # fractional seconds when they are exactly zero.
            create_time = datetime.datetime.strptime(f["create_time"][0:19],
                                                     '%Y-%m-%dT%H:%M:%S')
            # The API timestamp is UTC, so convert through an aware datetime.
            # (The previous strftime('%s') was non-portable and interpreted
            # the naive datetime in the host's local timezone.)
            epoch_seconds = int(
                create_time.replace(tzinfo=datetime.timezone.utc).timestamp())
            f["create_time"] = {"seconds": epoch_seconds, "nanos": 0}
            f["location"]["container"] = {
                "project_id": project,
                "full_path": f"gs://{input_file_bucket}/{input_file}"
            }
            if labels:
                f["labels"] = [{"key": k, "value": v}
                               for k, v in labels.items()]
            else:
                # Drop the field when no labels were supplied; the default
                # avoids a KeyError when the finding has no "labels" key.
                f.pop("labels", None)
        # Upload findings JSON next to the redacted image. Replace only the
        # trailing extension — str.replace(ext, ...) would rewrite every
        # occurrence of ext in the path (and misbehave for an empty ext).
        if ext and output_file.endswith(ext):
            findings_file = output_file[:-len(ext)] + ".json"
        else:
            findings_file = os.path.splitext(output_file)[0] + ".json"
        out_blob = output_bucket_client.blob(findings_file)
        out_blob.upload_from_string(data=json.dumps(findings),
                                    content_type='application/json')
        print(
            f"Redaction metadata successfully uploaded to gs://{output_file_bucket}/{findings_file}"
        )
    finally:
        # Remove temp files even when download/redact/upload raises.
        for path in (tmp_file, tmp_file_redacted):
            if os.path.exists(path):
                os.remove(path)
    return {
        "redacted_image": {
            "bucket": output_file_bucket,
            "file": output_file
        },
        "findings": {
            "bucket": output_file_bucket,
            "file": findings_file
        }
    }