extraction/compress_wikidata_msgpack.py (45 lines of code) (raw):

""" Compress a jsonl version of Wikidata by throwing about descriptions and converting file to msgpack format. Usage ----- ``` python3 compress_wikidata_msgpack.py wikidata.json wikidata.msgpack ``` """ import argparse import msgpack from wikidata_linker_utils.wikidata_iterator import open_wikidata_file from wikidata_linker_utils.progressbar import get_progress_bar def parse_args(args=None): parser = argparse.ArgumentParser() parser.add_argument('wikidata') parser.add_argument('out') return parser.parse_args(args=args) def main(): args = parse_args() approx_max_quantity = 24642416 pbar = get_progress_bar('compress wikidata', max_value=approx_max_quantity, item='entities') pbar.start() seen = 0 with open(args.out, "wb") as fout: for doc in open_wikidata_file(args.wikidata, 1000): seen += 1 if 'descriptions' in doc: del doc['descriptions'] if 'labels' in doc: del doc['labels'] if 'aliases' in doc: del doc['aliases'] for claims in doc['claims'].values(): for claim in claims: if 'id' in claim: del claim['id'] if 'rank' in claim: del claim['rank'] if 'references' in claim: for ref in claim['references']: if 'hash' in ref: del ref['hash'] if 'qualifiers' in claim: for qualifier in claim['qualifiers'].values(): if 'hash' in qualifier: del qualifier['hash'] fout.write(msgpack.packb(doc)) if seen % 1000 == 0: if seen < approx_max_quantity: pbar.update(seen) pbar.finish() if __name__ == "__main__": main()