blink/candidate_retrieval/process_wiki_extractor_output.py [15:60]: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - parser = argparse.ArgumentParser() parser.add_argument( "--input", type=str, help="The full path to the file to process", required=True ) parser.add_argument( "--output", type=str, help="The full path to the output file", required=True ) args = parser.parse_args() input_file_path = args.input output_file_path = args.output if not os.path.isfile(input_file_path): print("Input file `{}` doesn't exist!".format(output_file_path)) sys.exit() if os.path.isfile(output_file_path): print("Output file `{}` already exists!".format(output_file_path)) sys.exit() xml_end_tag = "" entities_with_duplicate_titles = set() title2id = {} id_title2parsed_obj = {} num_lines = 0 with io.open(input_file_path, mode="rt", encoding="utf-8", errors="ignore") as f: for line in f: num_lines += 1 c = 0 with io.open(input_file_path, mode="rt", encoding="utf-8", errors="ignore") as f: for line in f: c += 1 if c % 1000000 == 0: print("Processed: {:.2f}%".format(c * 100 / num_lines)) if line.startswith("