in preprocessing/preprocess_i2b2_2014_ner.py [0:0]
def _label_run_key(label):
    """Return the key deciding whether adjacent characters share a chunk.

    'O' maps to None; every other label maps to ``label[2:]``, so labels that
    differ only in their first two characters (presumably a tagging prefix
    such as 'B-' / 'I-' -- confirm against the label producer) compare equal.
    """
    return None if label == 'O' else label[2:]


def _split_into_label_runs(sentence_by_char, labels_by_char):
    """Split one sentence into maximal runs of characters sharing a run key.

    Returns:
        (text_chunks, labels_chunks): parallel lists holding, for each run,
        the slice of ``sentence_by_char`` and the slice of ``labels_by_char``.
        An empty sentence yields two empty lists.
    """
    text_chunks, labels_chunks = [], []
    num_chars = len(labels_by_char)
    start = 0
    for end in range(1, num_chars + 1):
        # Close the current run at the end of the sentence or wherever the
        # run key changes ('O' <-> entity, or entity type A -> entity type B).
        at_boundary = (
            end == num_chars
            or _label_run_key(labels_by_char[end]) != _label_run_key(labels_by_char[end - 1])
        )
        if at_boundary:
            text_chunks.append(sentence_by_char[start:end])
            labels_chunks.append(labels_by_char[start:end])
            start = end
    return text_chunks, labels_chunks


def merge_into_words(text_by_char, all_labels_by_char):
    """Convert per-character text and labels into per-word text and labels.

    Each sentence is first cut into chunks at every label boundary (between
    'O' and non-'O' characters, and between non-'O' characters whose labels
    differ after the first two characters). Each chunk is then split on
    whitespace, so a word never spans two chunks. Words from an 'O' chunk are
    labelled 'O'. For any other chunk the first word takes the chunk's first
    character label, the last word takes the chunk's last character label, and
    every word in between takes the chunk's second character label.

    Args:
        text_by_char: sequence of sentences, each a sequence of characters
            (anything sliceable whose slices can be passed to ``''.join``).
        all_labels_by_char: sequence of label sequences parallel to
            ``text_by_char``, one label string per character.

    Returns:
        (text_by_word, all_labels_by_word): parallel lists with one entry per
        sentence that contains at least one word; sentences that are empty or
        whitespace-only are dropped.

    Raises:
        AssertionError: if the sentence counts differ, if a sentence and its
            labels differ in length, or if a chunk fails the sanity checks.
    """
    assert len(text_by_char) == len(all_labels_by_char), "Incorrect # of sentences!"

    text_by_word, all_labels_by_word = [], []

    for sentence_by_char, labels_by_char in zip(text_by_char, all_labels_by_char):
        assert len(sentence_by_char) == len(labels_by_char), "Incorrect # of chars in sentence!"

        # An all-'O' sentence needs no special case: it forms a single 'O'
        # chunk and goes through the same path below.
        text_chunks, labels_chunks = _split_into_label_runs(sentence_by_char, labels_by_char)

        sentence_by_word, labels_by_word = [], []
        for text_chunk, labels_chunk in zip(text_chunks, labels_chunks):
            assert len(text_chunk) == len(labels_chunk), "Bad Chunking (len)"
            assert len(text_chunk) > 0, "Bad chunking (len 0)" + str(text_chunks) + str(labels_chunks)

            labels_set = set(labels_chunk)
            assert labels_set == set(['O']) or (len(labels_set) <= 3 and 'O' not in labels_set), (
                ("Bad chunking (contents) %s" % ', '.join(labels_set)) + str(text_chunks) + str(labels_chunks)
            )

            text_chunk_by_word = ''.join(text_chunk).split()
            num_words = len(text_chunk_by_word)
            if num_words == 0:
                # Whitespace-only chunk: contributes no words.
                continue

            if labels_chunk[0] == 'O':
                labels_chunk_by_word = ['O'] * num_words
            elif num_words == 1:
                labels_chunk_by_word = [labels_chunk[0]]
            else:
                # Two or more words imply at least three characters, so
                # labels_chunk[1] exists; with exactly two words the middle
                # section is empty.
                labels_chunk_by_word = (
                    [labels_chunk[0]]
                    + [labels_chunk[1]] * (num_words - 2)
                    + [labels_chunk[-1]]
                )

            sentence_by_word.extend(text_chunk_by_word)
            labels_by_word.extend(labels_chunk_by_word)

        assert len(sentence_by_word) == len(labels_by_word), "Incorrect # of words in sentence!"

        if not sentence_by_word:
            continue

        text_by_word.append(sentence_by_word)
        all_labels_by_word.append(labels_by_word)

    return text_by_word, all_labels_by_word