def sort_bbs_line_by_line()

in ocr/utils/word_to_line.py [0:0]


def sort_bbs_line_by_line(bbs, y_overlap=0.2):
    '''
    Combine word bounding boxes into line bounding boxes and prune outliers.

    The word boxes are filtered, merged into lines and slightly expanded.
    The resulting lines then go through a series of heuristics:

    1. X start: drop lines whose x start is not below 0.5, then lines whose
       x start is 0.2 or more away from the median x start.
    2. X length: drop lines (the last line is always kept) whose length is
       0.5 or more away from the median length.
    3. Y height: split lines taller than 1.65 times the median height into
       a top half and a bottom half.
    4. Y consistency: drop lines whose summed overlap with all the other
       lines is 1 or more.

    Rows are treated as (x, y, w, h): column 0 is the x start, column 1 the
    y start, column 2 the length along x and column 3 the height.
    NOTE(review): the unit of the coordinates (presumably fractions of the
    image size) is defined by the helpers called here -- confirm there.

    Parameters
    ----------
    bbs:
        Word bounding boxes, passed unchanged to ``_filter_bbs``.
    y_overlap: float
        Overlap threshold forwarded to ``combine_bbs_into_lines``.

    Returns
    -------
    np.ndarray
        One row per remaining line. An empty array of shape (0, 4) is
        returned when no line is produced or none survives the x start
        heuristics.
    '''
    # Thresholds of the heuristics below, in the units of the boxes.
    max_x_start = 0.5
    max_x_start_deviation = 0.2
    max_x_length_deviation = 0.5
    max_height_ratio = 1.65
    max_total_overlap = 1

    word_bbs = _filter_bbs(bbs, min_size=0.0001)  # Filter small word BBs
    merged_bbs = combine_bbs_into_lines(word_bbs, y_overlap)
    expanded_bbs = [
        expand_bounding_box(line_bb, expand_bb_scale_x=0.1,
                            expand_bb_scale_y=0.05)
        for line_bb in merged_bbs
    ]
    if not expanded_bbs:
        # np.array([]) is 1-D, so the column indexing below would raise.
        return np.empty((0, 4))
    line_bbs = np.array(expanded_bbs)

    # X start heuristics
    # Keep only the lines that start before max_x_start
    line_bbs = line_bbs[line_bbs[:, 0] < max_x_start]
    if line_bbs.shape[0] == 0:
        return line_bbs

    # Keep only the lines that start close to the median x start
    x_start = line_bbs[:, 0]
    x_start_keep = (
        np.abs(x_start - np.median(x_start)) < max_x_start_deviation)
    line_bbs = line_bbs[x_start_keep]
    if line_bbs.shape[0] == 0:
        # Both filters can remove every line; np.vstack below would raise
        # on an empty list.
        return line_bbs

    # X length heuristics
    # Keep only the lines whose length is close to the median length. The
    # last line is excluded from the check and always kept, it is usually
    # allowed to be shorter.
    if len(line_bbs) > 1:
        body_bbs, last_line = line_bbs[:-1], line_bbs[-1]
        # Column 2 is the length itself (rows are x, y, w, h), so it must
        # not be combined with the x start.
        x_length = body_bbs[:, 2]
        x_length_keep = (
            np.abs(x_length - np.median(x_length)) < max_x_length_deviation)
        line_bbs = np.vstack([body_bbs[x_length_keep], last_line])

    # Y height heuristics
    # Split lines that are much taller than the median line into 2
    y_height = line_bbs[:, 3]
    needs_split = np.abs(y_height / np.median(y_height)) > max_height_ratio

    new_line_bbs = []
    for line_bb, split in zip(line_bbs, needs_split):
        if not split:
            new_line_bbs.append(line_bb)
            continue
        # Top half: same origin, half the height
        new_line_top = np.copy(line_bb)
        new_line_top[3] = new_line_top[3] / 2

        # Bottom half: starts at the vertical middle, half the height
        new_line_bottom = np.copy(line_bb)
        new_line_bottom[1] = new_line_bottom[1] + new_line_bottom[3] / 2
        new_line_bottom[3] = new_line_bottom[3] / 2

        new_line_bbs.append(new_line_top)
        new_line_bbs.append(new_line_bottom)
    line_bbs = np.vstack(new_line_bbs)

    # Y consistency heuristics
    # Keep only the lines whose overlap, summed over all the other lines,
    # stays below max_total_overlap
    line_total_overlap = []
    for i, line_i in enumerate(line_bbs):
        overlap_i = 0.0
        for j, line_j in enumerate(line_bbs):
            if i == j:
                continue
            overlap_i += _get_rect_overlap_percentage(
                line_i[0], line_i[1], line_i[2], line_i[3],
                line_j[0], line_j[1], line_j[2], line_j[3])
        line_total_overlap.append(overlap_i)
    overlap_keep = np.array(line_total_overlap) < max_total_overlap
    return line_bbs[overlap_keep]