in dpr/data/biencoder_data.py [0:0]
def split_table(cls, t: dict, max_length: int):
    """Split a table into text chunks that each begin with the header row.

    The first row whose linearized text is longer than one character is used
    as the header. Every later non-empty row is appended to the current
    chunk; as soon as the accumulated length reaches ``max_length`` the chunk
    is emitted and a new one is started from the header again. The check
    happens after a row is added, so a chunk can exceed ``max_length`` by up
    to the length of its last row.

    Args:
        t: Table dict. Only ``t["rows"]`` is read here; each row is handed to
            ``JsonLTablesQADataset._linearize_row``, which is expected to
            return a ``(text, length)`` pair.
        max_length: Threshold at which a chunk is closed, in the same units
            as the length returned by ``_linearize_row`` (presumably a
            word/token count -- confirm against that helper).

    Returns:
        A list of chunk strings. Each chunk is the header followed by one or
        more data rows, joined by newlines and terminated by a newline. The
        list is empty when no header is found or when no non-empty row
        follows the header.
    """
    rows = t["rows"]

    # Use the first non-empty row as the "header" repeated in every chunk.
    header = None
    header_len = 0
    start_row = 0
    for i, row in enumerate(rows):
        row_lin, row_len = JsonLTablesQADataset._linearize_row(row)
        if len(row_lin) > 1:  # TODO: change to checking cell value tokens
            header, header_len, start_row = row_lin, row_len, i
            break

    chunks = []
    current_rows = [header]
    current_len = header_len
    for row in rows[start_row + 1:]:
        row_lin, row_len = JsonLTablesQADataset._linearize_row(row)
        if len(row_lin) <= 1:  # TODO: change to checking cell value tokens
            # Skip empty rows *before* the size check so a chunk is only ever
            # flushed right after a data row was added. This prevents
            # header-only chunks (and joining a ``None`` header) when the
            # header alone already meets ``max_length``.
            continue
        current_rows.append(row_lin)
        current_len += row_len
        if current_len >= max_length:
            chunks.append("\n".join(current_rows) + "\n")
            current_rows = [header]
            current_len = header_len

    # Emit the data rows left over after the last full chunk, if any.
    if len(current_rows) > 1:
        chunks.append("\n".join(current_rows) + "\n")
    return chunks