in ocr/utils/word_to_line.py [0:0]
def sort_bbs_line_by_line(bbs, y_overlap=0.2):
    '''
    Function to combine word bbs into lines and prune/split the resulting
    line bbs with layout heuristics.

    Parameters
    ----------
    bbs:
        Word bounding boxes, passed unchanged to ``_filter_bbs``.
        NOTE(review): the thresholds below (0.5, 0.2, ...) suggest the
        coordinates are relative to the page size -- confirm with callers.
    y_overlap: float
        Forwarded to ``combine_bbs_into_lines``.

    Returns
    -------
    np.ndarray
        One line bounding box per row. Rows are handled as (x, y, w, h):
        column 3 is halved and added to column 1 when a line is split.
        An empty array is returned when no line survives the filters.
    '''
    line_bbs = _filter_bbs(bbs, min_size=0.0001)  # Filter small word BBs
    line_bbs = combine_bbs_into_lines(line_bbs, y_overlap)
    line_bbs = np.array([
        expand_bounding_box(line_bb, expand_bb_scale_x=0.1,
                            expand_bb_scale_y=0.05)
        for line_bb in line_bbs])
    if line_bbs.size == 0:
        # np.array([]) is 1-D, so the column indexing below would raise.
        # Boxes are consumed as 4 values per row further down.
        return np.empty((0, 4))

    # X start heuristics
    # Keep only the lines whose x start is below 0.5
    line_bbs = line_bbs[line_bbs[:, 0] < 0.5]
    if len(line_bbs) == 0:
        # Nothing left: the median of an empty array is NaN and
        # np.vstack([]) further down would raise ValueError.
        return line_bbs

    # Keep only the lines that start within 0.2 of the median x start
    x_start = line_bbs[:, 0]
    line_bbs = line_bbs[np.abs(x_start - np.median(x_start)) < 0.2]
    if len(line_bbs) == 0:
        return line_bbs

    # X length heuristics
    # Keep only the lines whose length is within 0.5 of the median length.
    # The last line is excluded from the check and always kept.
    if len(line_bbs) > 1:
        body, last_line = line_bbs[:-1], line_bbs[-1]
        # The line length is the width column (index 2); the rest of this
        # function treats boxes as (x, y, w, h), so "x - w" is not a length.
        x_length = body[:, 2]
        length_ok = np.abs(x_length - np.median(x_length)) < 0.5
        line_bbs = np.vstack([body[length_ok], last_line])

    # Y height heuristics
    # Split lines that are more than 1.65 times the median height in two
    y_height = line_bbs[:, 3]
    too_tall = np.abs(y_height / np.median(y_height)) > 1.65
    new_line_bbs = []
    for line_bb, split in zip(line_bbs, too_tall):
        if not split:
            new_line_bbs.append(line_bb)
            continue
        half_height = line_bb[3] / 2
        # Top half keeps the original y, bottom half starts at the middle.
        new_line_top = np.copy(line_bb)
        new_line_top[3] = half_height
        new_line_bottom = np.copy(line_bb)
        new_line_bottom[1] = line_bb[1] + half_height
        new_line_bottom[3] = half_height
        new_line_bbs.append(new_line_top)
        new_line_bbs.append(new_line_bottom)
    line_bbs = np.vstack(new_line_bbs)

    # Y consistency heuristics
    # Remove lines whose overlap with all the other lines sums to 1 or more
    line_total_overlap = []
    for i, line_i in enumerate(line_bbs):
        overlap_i = 0.0
        for j, line_j in enumerate(line_bbs):
            if i != j:
                overlap_i += _get_rect_overlap_percentage(
                    line_i[0], line_i[1], line_i[2], line_i[3],
                    line_j[0], line_j[1], line_j[2], line_j[3])
        line_total_overlap.append(overlap_i)
    overlap_ok = np.array(line_total_overlap) < 1
    return line_bbs[overlap_ok]