# terraform/pdf-redactor/templates/workflow.yaml (106 lines of code) (raw):
# Copyright 2021 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# PDF redaction workflow.
#
# Input: an event whose data carries the `bucket` and `name` of an object.
# The workflow then:
#   1. validates the object name and builds the run configuration,
#   2. calls the PDF splitter service to split the PDF into pages,
#   3. calls the DLP runner service once per page, in parallel,
#   4. calls the PDF merger service to concatenate the redacted pages,
#   5. passes the per-page findings files to the findings writer service,
#   6. returns the response body of the merger call.
#
# NOTE(review): the path and escaping suggest this file is rendered as a
# Terraform template. Placeholders written with a single dollar sign are
# filled in at render time; expressions written with a doubled dollar sign
# are left for the workflow engine to evaluate at run time. Confirm before
# changing any escaping.
main:
  params: [event]
  steps:
    - 1. Configure Execution:
        steps:
          - validate-inputs:
              switch:
                # text.match_regex reports a match anywhere in the string, so
                # the pattern is end-anchored: only names that END in ".pdf"
                # pass. Later steps rely on that 4-character suffix.
                - condition: '$${not(text.match_regex(event.data.name, ".*\\.pdf$"))}'
                  raise:
                    code: 400
                    message: "Input file not recognized as a PDF file."
          - get-timestamp:
              call: sys.now
              result: timestamp
          - configure-workflow:
              assign:
                # Single place for every setting the later steps read.
                - config:
                    working_bucket: "${working_bucket}"
                    output_bucket: "${output_bucket}"
                    input_bucket: $${event.data.bucket}
                    input_file: $${event.data.name}
                    project_id: $${sys.get_env("GOOGLE_CLOUD_PROJECT_ID")}
                    # The execution ID doubles as the per-run output folder
                    # passed to the splitter in step 2.
                    workflow_id: $${sys.get_env("GOOGLE_CLOUD_WORKFLOW_EXECUTION_ID")}
                    dpi: 92  # Higher DPI = better accuracy | Lower DPI = faster processing and smaller file sizes
                    dlp_template: "${dlp_template}"
                    include_quote_in_findings: true  # Whether to include the redacted data in BQ findings or not (under quote column)
                    # Forwarded unchanged to the DLP runner with every page.
                    findings_labels:
                      timestamp: $${timestamp}
                      job: $${sys.get_env("GOOGLE_CLOUD_WORKFLOW_ID")}
                      job_id: $${sys.get_env("GOOGLE_CLOUD_WORKFLOW_EXECUTION_ID")}
                      document: $${"gs://" + event.data.bucket + "/" + event.data.name}
                # NOTE(review): `status` is not read by any later step in this
                # workflow; kept as-is.
                - status: {}
    - 2. Split PDF into pages:
        call: http.post
        args:
          url: "${pdf_splitter_url}"
          auth:
            type: OIDC
          headers:
            "Content-Type": "application/json"
          body:
            input_file: $${config.input_file}
            input_file_bucket: $${config.input_bucket}
            output_bucket: $${config.working_bucket}
            output_folder: $${config.workflow_id}
            dpi: $${config.dpi}
        # The response body is iterated in step 3, one entry per page.
        result: pages_to_redact
    - 3. Run DLP on each page:
        steps:
          - define_step:
              assign:
                # Both maps are written from the parallel branches below and
                # must exist before the loop starts.
                - redacted_pages: {}
                - findings_files: {}
          - run-dlp-on-each-page:
              parallel:
                shared: [redacted_pages, findings_files]
                for:
                  value: page_to_redact
                  in: $${pages_to_redact.body}
                  index: i
                  steps:
                    - redact_page:
                        call: http.post
                        args:
                          url: "${dlp_runner_url}"
                          auth:
                            type: OIDC
                          body:
                            input_file: $${page_to_redact}
                            input_file_bucket: $${config.working_bucket}
                            # NOTE(review): assumes page names end in ".jpg"
                            # and contain it only once - confirm against the
                            # splitter service.
                            output_file: $${text.split(page_to_redact, ".jpg")[0] + "-redacted.jpg"}
                            output_file_bucket: $${config.working_bucket}
                            dlp_template: $${config.dlp_template}
                            include_quote_in_findings: $${config.include_quote_in_findings}
                            project_id: $${config.project_id}
                            findings_labels: $${config.findings_labels}
                        result: redacted_page_result
                    - concatenate_output:
                        assign:
                          # Results are keyed by the loop index so each page's
                          # output can be matched to its position no matter
                          # which branch finishes first.
                          - redacted_pages[string(i)]: $${redacted_page_result.body.redacted_image.file}
                          - findings_files[string(i)]: $${redacted_page_result.body.findings.file}
    - 4. Concatenate pages:
        call: http.post
        args:
          url: "${pdf_merger_url}"
          auth:
            type: OIDC
          body:
            files_bucket: $${config.working_bucket}
            files_to_concatenate: $${redacted_pages}
            # Strip exactly the trailing ".pdf" (4 characters, guaranteed by
            # validate-inputs). Splitting on ".pdf" would cut at the first
            # occurrence and mangle names such as "a.pdf.v2.pdf".
            output_file: $${text.substring(config.input_file, 0, len(config.input_file) - 4) + "-redacted.pdf"}
            output_file_bucket: $${config.output_bucket}
        result: redacted_pdf
    - 5. Write Findings:
        call: http.post
        args:
          url: "${findings_writer_url}"
          auth:
            type: OIDC
          body:
            findings_files: $${findings_files}
            files_bucket: $${config.working_bucket}
            project_id: $${config.project_id}
        result: bq_output
    - 6. Done!:
        # The workflow result is the merger service's response body.
        return: $${redacted_pdf.body}