in utils/import_conversations_v2.py [0:0]
def _Redact(transcript_response, project_id, impersonated_service_account):
    """Masks sensitive terms in a transcription response via Cloud DLP.

    The response is converted to text with `str()` and submitted to DLP as a
    single content item. Findings for the configured info types are replaced
    with '*' characters.

    Args:
      transcript_response: The response from transcription. It is stringified
        before being sent to DLP.
      project_id: The project ID (not number) used as the parent of the DLP
        request.
      impersonated_service_account: The service account to impersonate when
        building the client credentials.

    Returns:
      The `item.value` of the DLP de-identify response, i.e. the masked text
      form of the transcription response.
    """
    # The project is conveyed through the request's `parent` below rather than
    # through the client constructor, so only credentials are passed here.
    credentials = _GetClientCredentials(impersonated_service_account)
    dlp_client = google.cloud.dlp_v2.DlpServiceClient(credentials=credentials)

    # Info types to mask. Being too aggressive here can damage word time
    # offsets, because the entire STT response (not just the transcript text)
    # is sent to DLP. Eventually only the transcript parts should be submitted
    # so that nothing else can be redacted.
    redacted_type_names = (
        'AGE',
        'CREDIT_CARD_NUMBER',
        'CREDIT_CARD_TRACK_NUMBER',
        'DOMAIN_NAME',
        'EMAIL_ADDRESS',
        'FEMALE_NAME',
        'MALE_NAME',
        'FIRST_NAME',
        'GENDER',
        'GENERIC_ID',
        'IP_ADDRESS',
        'LAST_NAME',
        'LOCATION',
        'PERSON_NAME',
        'PHONE_NUMBER',
        'STREET_ADDRESS',
    )
    inspect_config = {
        'info_types': [{'name': type_name} for type_name in redacted_type_names]
    }

    mask_config = {
        # PII terms are replaced with a series of '*'.
        'masking_character': '*',
        # Zero means no limit on the number of characters to redact.
        'number_to_mask': 0,
    }
    mask_transformation = {
        'primitive_transformation': {'character_mask_config': mask_config}
    }
    deidentify_config = {
        'info_type_transformations': {'transformations': [mask_transformation]}
    }

    request = {
        'parent': f'projects/{project_id}',
        'deidentify_config': deidentify_config,
        'inspect_config': inspect_config,
        'item': {'value': str(transcript_response)},
    }
    dlp_response = dlp_client.deidentify_content(request=request)
    return dlp_response.item.value