in infra-as-code/modules/ingest-pipeline/cf-audio-redaction/audio_redaction.py [0:0]
def get_redaction_intervals(self, json_file):
    """Collect the transcript word entries that match the DLP findings.

    Each string in ``json_file['dlp']`` is split on single spaces into a
    phrase. A one-word phrase matches every transcript word whose
    ``'word'`` value equals it. A longer phrase matches every run of
    consecutive transcript words whose ``'word'`` values equal the phrase,
    word for word.

    Args:
        json_file: Mapping with a ``'words'`` list of dicts (each holding
            a ``'word'`` key) and a ``'dlp'`` list of strings.

    Returns:
        A list of the matching dicts taken from ``json_file['words']``
        (the same objects, not copies), ordered by DLP entry and then by
        transcript position. A word matched by several DLP entries is
        listed once per match.

    Raises:
        KeyError: If ``'words'`` or ``'dlp'`` is missing from
            ``json_file``, or a word entry has no ``'word'`` key.

    Side effects:
        Stores the transcript position under ``'index'`` on every matched
        single word and on the first word of every matched phrase.
    """
    words = json_file['words']
    word_list = [word_item['word'] for word_item in words]
    redaction_intervals = []

    for dlp_entry in json_file['dlp']:
        phrase = dlp_entry.split(' ')
        phrase_len = len(phrase)

        if phrase_len >= 2:
            # Test every start position exactly once. Scanning the
            # transcript once per phrase word reported the same match
            # several times whenever the phrase's first word was repeated
            # inside the phrase (e.g. "john john smith").
            for start in range(len(words) - phrase_len + 1):
                end = start + phrase_len
                if word_list[start:end] == phrase:
                    # NOTE(review): only the first word of a phrase match
                    # is tagged with 'index', unlike single-word matches
                    # where every hit is tagged - confirm what consumers
                    # of the returned entries expect.
                    words[start]['index'] = start
                    redaction_intervals.extend(words[start:end])
        else:
            # str.split(' ') never returns an empty list, so the phrase
            # holds exactly one word here.
            target = phrase[0]
            for position, element in enumerate(words):
                if element['word'] == target:
                    element['index'] = position
                    redaction_intervals.append(element)

    return redaction_intervals