in dpr/data/tables.py [0:0]
def read_nq_tables_jsonl(path: str, out_file: str = None) -> Dict[str, Table]:
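    """Read the NQ tables JSONL dump at ``path`` and parse it into Table objects.

    Pages carrying the Wikipedia "multiple issues" banner are skipped, parsed tables are
    de-duplicated by their key, and per-category counts (regular, single-row, nested,
    with issues) are reported. If ``out_file`` is given, the tables are also exported via
    convert_to_csv_for_lucene. Returns the de-duplicated {table key: Table} dict.
    """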
    tables_with_issues = 0
    single_row_tables = 0
    nested_tables = 0
    regular_tables = 0
    total_tables = 0
    total_rows = 0
    tables_dict = {}

    with jsonlines.open(path, mode="r") as jsonl_reader:
        for jline in jsonl_reader:
            tokens = jline["tokens"]
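
            # Skip pages whose tokens contain the Wikipedia "multiple issues" maintenance banner.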
if "( hide ) This section has multiple issues" in " ".join(tokens):
tables_with_issues += 1
continue
# if '<Table>' in tokens[1:]:
# nested_tables += 1
mask = jline["html_mask"]
page_url = jline["doc_url"]
title = jline["title"]
# logger.info('Table from page %s', title)
# logger.info('tokens len %s', len(tokens))
# logger.info('tokens %s', tokens)
# logger.info('page_url %s', page_url)
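
            # Parse the token stream and its HTML mask into one or more Table objects.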
            p = NQTableParser(tokens, mask, title)
            tables = p.parse()
            # logger.info('parsed tables %d', len(tables))
            # table = parse_table(tokens, mask)
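
            # The first parsed table is the top-level one; anything after it is a nested table.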
            nested_tables += len(tables[1:])

            for t in tables:
                # logger.info('Table: %s', t)
                total_tables += 1

                # count rows that have at least one non-empty cell
                non_empty_rows = sum(1 for r in t.body if r.cells and any(c.value_tokens for c in r.cells))
                if non_empty_rows <= 1:
                    single_row_tables += 1
                else:
                    regular_tables += 1
                    total_rows += len(t.body)
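
                    # De-duplicate: keep only the first table seen for each key.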
                    if t.get_key() not in tables_dict:
                        tables_dict[t.get_key()] = t
                        if len(tables_dict) % 1000 == 0:
                            logger.info("tables_dict %d", len(tables_dict))
print("regular tables", regular_tables)
print("tables_with_issues", tables_with_issues)
print("single_row_tables", single_row_tables)
print("nested_tables", nested_tables)
if out_file:
convert_to_csv_for_lucene(tables_dict, out_file)
return tables_dict
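

# Usage sketch (the JSONL/CSV paths below are placeholders, not files from the repo):
# read the preprocessed NQ tables dump and, optionally, export the de-duplicated tables
# to a Lucene-ready CSV in the same call.
if __name__ == "__main__":
    nq_tables = read_nq_tables_jsonl("/path/to/nq_tables.jsonl", out_file="/path/to/nq_tables.csv")
    print("unique tables:", len(nq_tables))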