# def main()
#
# in preprocess/htmltable.py [0:0]


def main():
    """Convert the wiki table referenced by an MTurk metadata file to CSV.

    Command-line flow:

    1. Load the JSON metadata given by ``--turk-json``.
    2. Derive the HTML page path from the metadata file's own path
       (``json/<batch>-json/<id>.json`` -> ``page/<batch>-page/<id>.html``).
    3. Extract the table at ``metadata['tableIndex']`` from that page and
       remove redundant rows and columns.
    4. Dump the rows as CSV to ``--outfile`` (stdout when omitted).  When an
       outfile is given, ``--tsv``, ``--human`` and ``--html`` additionally
       write ``<stem>.tsv``, ``<stem>.table`` and ``<stem>.html``, where
       ``<stem>`` is the outfile name without its ``.csv`` suffix.

    Invalid arguments are reported through ``parser.error``, which prints a
    usage message and exits with status 2.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-j', '--turk-json',
                        help="json metadata file from MTurk task")
    parser.add_argument('-o', '--outfile',
                        help="output filename (default = stdout)")
    parser.add_argument('--tsv', action='store_true',
                        help='also print out tsv')
    parser.add_argument('--human', action='store_true',
                        help='also print out human-readable table')
    parser.add_argument('--html', action='store_true',
                        help='also print out cleaned html for the table')
    parser.add_argument('--keep-hidden', action='store_true',
                        help='keep hidden texts as is')
    args = parser.parse_args()

    # Validate explicitly instead of with `assert`: asserts vanish under -O,
    # and a missing --outfile used to crash on `None.endswith`.
    if not args.turk_json:
        parser.error('--turk-json is required')
    if args.tsv and not (args.outfile and args.outfile.endswith('.csv')):
        parser.error('--tsv requires --outfile with a .csv extension')

    with open(args.turk_json) as fin:
        metadata = json.load(fin)

    # Get the path to the HTML file.
    # This is kind of hacky: the batch and data ids are recovered from the
    # metadata file's path, so the path must have exactly this layout.
    match = re.match(
        r'^(?:json|page)/(\d+)-(?:json|page)/(\d+)\.json$', args.turk_json)
    if match is None:
        parser.error(
            'cannot derive the HTML path from {!r}; expected a path like '
            'json/<batch>-json/<id>.json'.format(args.turk_json))
    batch_id, data_id = match.groups()
    inhtml = 'page/{}-page/{}.html'.format(batch_id, data_id)

    # NOTE(review): the three-positional-argument open(path, mode, 'utf8')
    # calls presumably rely on `open` being codecs.open imported at file
    # level -- confirm against the module imports.
    with open(inhtml, 'r', 'utf8') as fin:
        raw = fin.read()
    table = HtmlTable.get_wikitable(raw, metadata['tableIndex'],
                                    normalization=HtmlTable.NORM_DUPLICATE,
                                    remove_hidden=(not args.keep_hidden))
    if args.html:
        # Second extraction with hidden text kept, used only for the
        # optional .html dump below.
        raw_table = HtmlTable.get_wikitable(raw, metadata['tableIndex'],
                                            remove_hidden=False).table

    rows = table.rows
    # rows = list of columns; column = list of cells; cell = (tag, text)
    # Remove redundant rows and columns
    rows = remove_full_rowspans(rows)
    cols = transpose(rows)
    cols = remove_empty_columns(cols)
    cols = merge_similar_columns(cols)
    # Fixed: a stray trailing ", ''" used to turn `rows` into a 2-tuple here.
    rows = anti_transpose(cols)
    rows = merge_header_rows(rows)

    # Dump
    if not args.outfile:
        dump_csv(rows, sys.stdout)
    else:
        stem = re.sub(r'\.csv$', '', args.outfile)
        with open(args.outfile, 'w', 'utf8') as fout:
            dump_csv(rows, fout)
        if args.tsv:
            with open(stem + '.tsv', 'w', 'utf8') as fout:
                dump_tsv(rows, fout)
        if args.human:
            with open(stem + '.table', 'w', 'utf8') as fout:
                dump_table(rows, fout)
        if args.html:
            with open(stem + '.html', 'w', 'utf8') as fout:
                # Same text as `print >> fout, ...` (value plus newline)
                # without depending on the print statement.
                fout.write(unicode(raw_table) + '\n')