in src/handler.py [0:0]
def redact(text, classification_segmenter: Segmenter, detection_segmenter: Segmenter,
           redactor: Redactor, comprehend: ComprehendClient, redaction_config: RedactionConfig,
           language_code) -> Document:
    """
    Redact PII data from the given text.

    When ``REDACTION_API_ONLY`` is set, the classification pass is skipped and the whole
    text is sent straight to entity detection. Otherwise the logic is:

    1. Segment the text into subsegments of reasonable size (max doc size supported by
       comprehend) for the initial classification.
    2. For each subsegment:
       2.1 call comprehend's classify-pii-document api to determine if it contains PII;
       2.2 if it contains PII of interest (per ``get_interested_pii``), split it into
           smaller chunks (e.g. <=5KB), else skip to the next subsegment;
       2.3 call comprehend's detect-pii-entities on each chunk to extract the entities.
    3. Merge all subsegments and chunks back into a single document.
    4. Redact the detected entities from the original text.

    :param text: the text to redact.
    :param classification_segmenter: segmenter used for the classification pass and for
        merging segments back into one document.
    :param detection_segmenter: segmenter used to produce chunks for entity detection.
    :param redactor: performs the actual redaction given the text and the PII entities.
    :param comprehend: client used for PII classification and entity detection.
    :param redaction_config: passed to ``get_interested_pii`` to select relevant PII.
    :param language_code: language code forwarded to the comprehend calls.
    :return: the merged document with ``redacted_text`` set. If no subsegment contains
        PII of interest, a new ``Document`` whose ``redacted_text`` equals its text.
    :raises AssertionError: if the merged document's text length differs from the length
        of the input text.
    """
    if REDACTION_API_ONLY:
        # No classification pass: treat the whole input as one document and chunk it.
        whole_doc = Document(text)
        documents = [whole_doc]
        docs_for_entity_detection = detection_segmenter.segment(whole_doc.text, whole_doc.char_offset)
    else:
        documents = comprehend.contains_pii_entities(classification_segmenter.segment(text), language_code)
        pii_docs = [doc for doc in documents if get_interested_pii(doc, redaction_config)]
        if not pii_docs:
            LOG.debug("Document doesn't have any pii. Nothing to redact.")
            unchanged_text = classification_segmenter.de_segment(documents).text
            return Document(unchanged_text, redacted_text=unchanged_text)
        # Only the subsegments flagged with relevant PII are chunked for detection; the
        # subsegment's char_offset is handed to the segmenter along with its text.
        docs_for_entity_detection = []
        for pii_doc in pii_docs:
            docs_for_entity_detection.extend(detection_segmenter.segment(pii_doc.text, pii_doc.char_offset))

    docs_with_pii_entities = comprehend.detect_pii_documents(docs_for_entity_detection, language_code)
    resultant_doc = classification_segmenter.de_segment(documents + docs_with_pii_entities)

    # The entities collected on resultant_doc are applied to the original `text` below, so
    # a merged document of a different length means the segments were not reassembled
    # faithfully. Raised explicitly instead of via `assert` so that the check is not
    # stripped when Python runs with -O; the exception type and message are unchanged.
    if len(resultant_doc.text) != len(text):
        raise AssertionError("Not able to recover original document after segmentation and desegmentation.")

    resultant_doc.redacted_text = redactor.redact(text, resultant_doc.pii_entities)
    return resultant_doc