def read_nq_tables_jsonl()

in dpr/data/tables.py [0:0]


def read_nq_tables_jsonl(path: str, out_file: str = None) -> Dict[str, Table]:
    """Parse table records from a JSON-lines file into a de-duplicated dict.

    Each record is expected to provide the ``"tokens"``, ``"html_mask"`` and
    ``"title"`` keys, which are handed to ``NQTableParser``. Records whose
    space-joined tokens contain the "multiple issues" banner text are skipped
    entirely. Of the parsed tables, those with at most one non-empty row are
    counted and dropped; the rest are stored under ``get_key()``, keeping the
    first table seen for every key.

    Summary counters are printed to stdout once the whole file has been read.

    Args:
        path: Location of the JSON-lines file to read.
        out_file: If truthy, the collected tables are additionally passed to
            ``convert_to_csv_for_lucene`` together with this value.

    Returns:
        Mapping from table key to the first table parsed with that key.
    """
    tables_with_issues = 0
    single_row_tables = 0
    nested_tables = 0
    regular_tables = 0
    tables_dict = {}

    with jsonlines.open(path, mode="r") as jsonl_reader:
        for jline in jsonl_reader:
            tokens = jline["tokens"]

            if "( hide ) This section has multiple issues" in " ".join(tokens):
                tables_with_issues += 1
                continue

            tables = NQTableParser(tokens, jline["html_mask"], jline["title"]).parse()

            # Every table after the first one parsed from a record is counted
            # as nested.
            nested_tables += len(tables[1:])

            for t in tables:
                # A row is non-empty when at least one of its cells carries
                # value tokens.
                non_empty_rows = sum(
                    1
                    for r in t.body
                    if r.cells and any(c.value_tokens for c in r.cells)
                )

                if non_empty_rows <= 1:
                    single_row_tables += 1
                    continue

                regular_tables += 1

                key = t.get_key()
                if key in tables_dict:
                    # First occurrence wins; duplicates are still counted as
                    # regular tables above.
                    continue
                tables_dict[key] = t

                # Report progress only when the dict actually grows to a
                # multiple of 1000, so each milestone is logged exactly once
                # (checking per record would repeat the message whenever the
                # size is unchanged, including while it is still 0).
                if len(tables_dict) % 1000 == 0:
                    logger.info("tables_dict %d", len(tables_dict))

    print("regular tables", regular_tables)
    print("tables_with_issues", tables_with_issues)
    print("single_row_tables", single_row_tables)
    print("nested_tables", nested_tables)
    if out_file:
        convert_to_csv_for_lucene(tables_dict, out_file)
    return tables_dict