in preprocess/htmltable.py [0:0]
def main():
    """Command-line entry point: dump one wiki table as CSV (plus optional extras).

    Reads the MTurk JSON metadata file given by ``-j``, derives the matching
    HTML page path from the metadata path, extracts the table whose index is
    stored under ``metadata['tableIndex']``, removes redundant rows/columns,
    and writes the result as CSV to ``-o`` (or to stdout when ``-o`` is not
    given).

    When ``-o`` is given, ``--tsv``, ``--human`` and ``--html`` additionally
    write ``<stem>.tsv``, ``<stem>.table`` and ``<stem>.html``, where
    ``<stem>`` is the output filename with a trailing ``.csv`` removed.
    These three flags have no effect when writing to stdout.

    Exits through ``parser.error`` (status 2) when:
      * ``-j`` is missing,
      * ``--tsv`` is used without an ``-o`` filename ending in ``.csv``,
      * the ``-j`` path does not look like ``json/<batch>-json/<id>.json``
        (``page`` is accepted in place of ``json``).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('-j', '--turk-json',
                        help="json metadata file from MTurk task")
    parser.add_argument('-o', '--outfile',
                        help="output filename (default = stdout)")
    parser.add_argument('--tsv', action='store_true',
                        help='also print out tsv')
    parser.add_argument('--human', action='store_true',
                        help='also print out human-readable table')
    parser.add_argument('--html', action='store_true',
                        help='also print out cleaned html for the table')
    parser.add_argument('--keep-hidden', action='store_true',
                        help='keep hidden texts as is')
    args = parser.parse_args()

    # Validate the command line up front so a bad invocation fails with a
    # readable message before any file is touched.
    if not args.turk_json:
        parser.error('-j/--turk-json is required')
    # Same precondition the original assert expressed, but it also covers
    # the case where --tsv is given with no outfile at all (outfile is None).
    if args.tsv and not (args.outfile and args.outfile.endswith('.csv')):
        parser.error('--tsv requires -o/--outfile with a name ending in .csv')

    # Get the path to the HTML file from the metadata path.
    # This is kind of hacky: it relies on the directory layout
    # "<json|page>/<batch>-<json|page>/<id>.json".
    match = re.match(r'^(?:json|page)/(\d+)-(?:json|page)/(\d+)\.json$',
                     args.turk_json)
    if match is None:
        parser.error('cannot derive HTML path from {!r}; expected something '
                     'like json/<batch>-json/<id>.json'.format(args.turk_json))
    batch_id, data_id = match.groups()
    inhtml = 'page/{}-page/{}.html'.format(batch_id, data_id)

    with open(args.turk_json) as fin:
        metadata = json.load(fin)
    # NOTE(review): the three-argument open(path, mode, 'utf8') form assumes
    # `open` is codecs.open (imported at file level) -- confirm.
    with open(inhtml, 'r', 'utf8') as fin:
        raw = fin.read()

    table = HtmlTable.get_wikitable(raw, metadata['tableIndex'],
                                    normalization=HtmlTable.NORM_DUPLICATE,
                                    remove_hidden=(not args.keep_hidden))
    if args.html:
        # Second extraction with hidden text kept; only its `.table`
        # attribute is needed for the --html dump below.
        raw_table = HtmlTable.get_wikitable(raw, metadata['tableIndex'],
                                            remove_hidden=False).table

    rows = table.rows
    # rows = list of columns; column = list of cells; cell = (tag, text)
    # Remove redundant rows and columns
    rows = remove_full_rowspans(rows)
    cols = transpose(rows)
    cols = remove_empty_columns(cols)
    cols = merge_similar_columns(cols)
    # Back to row-major form. (A stray trailing ", ''" here used to wrap the
    # result in a 2-tuple, which broke every later step.)
    rows = anti_transpose(cols)
    rows = merge_header_rows(rows)

    # Dump
    if not args.outfile:
        dump_csv(rows, sys.stdout)
    else:
        # Sibling outputs share the outfile name minus a trailing ".csv".
        stem = re.sub(r'\.csv$', '', args.outfile)
        with open(args.outfile, 'w', 'utf8') as fout:
            dump_csv(rows, fout)
        if args.tsv:
            with open(stem + '.tsv', 'w', 'utf8') as fout:
                dump_tsv(rows, fout)
        if args.human:
            with open(stem + '.table', 'w', 'utf8') as fout:
                dump_table(rows, fout)
        if args.html:
            with open(stem + '.html', 'w', 'utf8') as fout:
                print >> fout, unicode(raw_table)