in dpr/data/biencoder_data.py [0:0]
def _linearize_table(self, t: dict, is_positive: bool) -> str:
    """Flatten selected rows of a table into one newline-delimited string.

    Rows are collected in three stages, accumulating the per-row length
    reported by ``JsonLTablesQADataset._linearize_row``:

    1. The first row whose linearized form is longer than one character is
       taken as the "header" (regardless of ``self.max_len``).
    2. If ``is_positive`` is true, the rows referenced by
       ``t["answer_pos"]`` are added (shuffled first when
       ``self.shuffle_positives`` is set) until the accumulated length
       reaches ``self.max_len``.
    3. If the accumulated length is still below ``self.max_len``, the
       remaining rows are added, in a random order when
       ``self.is_train_set`` is set and in table order otherwise, skipping
       rows whose linearized form is at most one character long.

    Args:
        t: table dict. ``t["rows"]`` is always read; ``t["answer_pos"]`` is
            read only when ``is_positive`` is true, and the first item of
            each of its entries is used as an index into ``t["rows"]``.
        is_positive: whether the answer rows must be forced into the output.

    Returns:
        The selected linearized rows in selection order, each followed by a
        newline character; an empty string if no row was selected.
    """
    rows = t["rows"]
    selected_rows = set()
    rows_linearized = []
    total_words_len = 0

    # Get the first non-empty row as the "header".
    for i, r in enumerate(rows):
        row_lin, row_len = JsonLTablesQADataset._linearize_row(r)
        if len(row_lin) > 1:  # TODO: change to checking cell value tokens
            selected_rows.add(i)
            rows_linearized.append(row_lin)
            total_words_len += row_len
            break

    # Force the rows that hold answers into the output of positive tables.
    if is_positive:
        row_idx_with_answers = [ap[0] for ap in t["answer_pos"]]

        if self.shuffle_positives:
            self.rnd.shuffle(row_idx_with_answers)
        for i in row_idx_with_answers:
            # Several answers may share a row (or sit in the header row);
            # each row is emitted only once. Unlike the other stages, answer
            # rows are kept even when their linearized form is very short.
            if i not in selected_rows:
                row_lin, row_len = JsonLTablesQADataset._linearize_row(rows[i])
                selected_rows.add(i)
                rows_linearized.append(row_lin)
                total_words_len += row_len
            # Checked on every iteration, including already-selected rows.
            if total_words_len >= self.max_len:
                break

    if total_words_len < self.max_len:  # append random rows
        if self.is_train_set:
            # NOTE(review): this draws from the global numpy RNG, whereas the
            # positive shuffle above uses self.rnd - confirm that is intended.
            rows_indexes = np.random.permutation(range(len(rows)))
        else:
            rows_indexes = range(len(rows))

        for i in rows_indexes:
            if i not in selected_rows:
                row_lin, row_len = JsonLTablesQADataset._linearize_row(rows[i])
                if len(row_lin) > 1:  # TODO: change to checking cell value tokens
                    selected_rows.add(i)
                    rows_linearized.append(row_lin)
                    total_words_len += row_len
                # Only re-checked after a candidate row was examined; skipped
                # rows cannot change the accumulated length.
                if total_words_len >= self.max_len:
                    break

    # Single join instead of repeated string concatenation.
    return "".join(r + "\n" for r in rows_linearized)