in utils.py [0:0]
def augment_sentence(tokens: List[str], augmentations: List[Tuple[List[tuple], int, int]], begin_entity_token: str,
                     sep_token: str, relation_sep_token: str, end_entity_token: str) -> str:
    """
    Augment a sentence by adding tags in the specified positions.

    The augmentations are sorted, arranged into a containment tree (an
    augmentation nested inside another becomes its child), and the tree is
    handed to ``expand_tokens``; the pieces it returns are joined with
    single spaces.

    Args:
        tokens: Tokens of the sentence to augment.
        augmentations: List of tuples (tags, start, end). In the example
            below, ``start``/``end`` are token positions with ``end``
            exclusive.
        begin_entity_token: Beginning token for an entity, e.g. '['
        sep_token: Separator token, e.g. '|'
        relation_sep_token: Separator token for relations, e.g. '='
        end_entity_token: End token for an entity e.g. ']'

    Returns:
        The augmented sentence as a single space-separated string.

    Note:
        An augmentation that partially overlaps one of the augmentations
        currently enclosing the scan position (it starts strictly inside it
        and ends strictly after it) breaks the tree structure; it is dropped
        and a warning is logged.

    An example follows.

    tokens:
        ['Tolkien', 'was', 'born', 'here']

    augmentations:
        [
            ([('person',), ('born in', 'here')], 0, 1),
            ([('location',)], 3, 4),
        ]

    output augmented sentence:
        [ Tolkien | person | born in = here ] was born [ here | location ]
    """
    # Sort entities by start position, longer entities first, so that a
    # containing entity is always visited before the entities nested in it.
    augmentations = sorted(augmentations, key=lambda z: (z[1], -z[2]))

    # Check that the entities have a tree structure (if two entities overlap,
    # then one is contained in the other), and build the entity tree.
    # Each node is represented by its position in the sorted list of
    # augmentations, except that the root is -1.
    root = -1
    entity_tree = {root: []}  # list of children of each node
    current_stack = [root]  # path from the root to where we are in the tree

    for j, x in enumerate(augmentations):
        # Unpacking also enforces that every augmentation is a 3-tuple.
        _, start, end = x

        # The root sentinel is skipped explicitly: -1 is not a real position,
        # and augmentations[-1] would silently pick the last augmentation.
        if any(
            augmentations[k][1] < start < augmentations[k][2] < end
            for k in current_stack
            if k != root
        ):
            # tree structure is not satisfied!
            logging.warning(f'Tree structure is not satisfied! Dropping annotation {x}')
            continue

        # Climb back up until the top of the stack contains the new entity
        # (the root contains everything).
        while current_stack[-1] != root and \
                not (augmentations[current_stack[-1]][1] <= start <= end <= augmentations[current_stack[-1]][2]):
            current_stack.pop()

        # add as a child of its father
        entity_tree[current_stack[-1]].append(j)

        # update stack
        current_stack.append(j)

        # create empty list of children for this new node
        entity_tree[j] = []

    return ' '.join(expand_tokens(
        tokens, augmentations, entity_tree, root, begin_entity_token, sep_token, relation_sep_token, end_entity_token
    ))