in src/processors.py [0:0]
def de_segment(self, segments: List[Document]) -> Document:
    """
    Merge text segments back into a single document, carrying over their PII results.

    Segments are processed in ascending ``char_offset`` order. The caller's
    list is left untouched (a sorted copy is iterated). Text that a segment
    shares with what has already been merged is appended only once.

    Conflicts on overlapping text are resolved by the helper methods, which
    are not defined in this block. According to their intended contract:
    1. For pii classification, the maximum threshold for an entity amongst the
       segments becomes the threshold for that entity in the merged document.
    2. For pii entity annotations, a conflicting annotation span gives higher
       priority to the one with the higher confidence threshold.

    NOTE(review): if a segment starts beyond the end of the text merged so far
    (a gap between segments, or a first segment with a non-zero offset), the
    whole segment text is appended and the gap is NOT padded. Annotation
    offsets presumably will not line up with the merged text in that case;
    confirm whether callers can ever pass non-contiguous segments.

    :param segments: Document segments of one original text, in any order.
    :return: A Document with ``char_offset`` 0 holding the merged text, the
        merged pii classification and the merged pii entities.
    """
    text_parts = []
    merged_length = 0
    pii_classification = {}
    pii_entities = []
    # sorted() instead of list.sort(): do not reorder the caller's list.
    for segment in sorted(segments, key=lambda x: x.char_offset):
        offset_adjusted_segment = Document(
            text=segment.text,
            char_offset=segment.char_offset,
            pii_entities=self._relocate_annotation(segment.pii_entities, segment.char_offset),
            pii_classification=segment.pii_classification,
        )
        # Both helpers update the accumulators passed to them in place.
        self._merge_classifcation_results(segment, pii_classification)
        self._merge_pii_annotation_results(offset_adjusted_segment, pii_entities)
        # Number of leading characters of this segment already present in the
        # merged text. Clamped at 0: a negative slice start would make Python
        # slice from the END of the segment and silently drop its head.
        overlap = max(0, merged_length - segment.char_offset)
        new_text = segment.text[overlap:]
        text_parts.append(new_text)
        merged_length += len(new_text)
    # Single join instead of repeated string concatenation inside the loop.
    return Document(
        text="".join(text_parts),
        char_offset=0,
        pii_classification=pii_classification,
        pii_entities=pii_entities,
    )