blink/candidate_retrieval/process_wiki_extractor_output.py [15:60]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Command-line interface: both paths are mandatory string options.
parser = argparse.ArgumentParser()

for _flag, _help_text in (
    ("--input", "The full path to the file to process"),
    ("--output", "The full path to the output file"),
):
    parser.add_argument(_flag, type=str, help=_help_text, required=True)

# Reads the process arguments; argparse exits with a usage message when a
# required option is missing.
args = parser.parse_args()

# Paths supplied on the command line.
input_file_path = args.input
output_file_path = args.output

# Stop early when there is nothing to read. The message reports the *input*
# path; it previously formatted the output path, pointing the user at the
# wrong file.
# NOTE(review): sys.exit() without an argument terminates with status 0, so
# callers cannot detect this failure from the exit code -- consider a
# non-zero status if nothing depends on the current behavior.
if not os.path.isfile(input_file_path):
    print("Input file `{}` doesn't exist!".format(input_file_path))
    sys.exit()

# Refuse to continue when the output file is already present.
if os.path.isfile(output_file_path):
    print("Output file `{}` already exists!".format(output_file_path))
    sys.exit()

# Closing tag appended to a "<doc id=" header line so that the single line
# can be parsed as a complete XML element.
xml_end_tag = "</doc>"

# Bookkeeping containers, created empty here.
# NOTE(review): judging by the names these hold title -> id, titles seen more
# than once, and (id, title) -> parsed record -- confirm against the parsing
# loop further down.
title2id = {}
entities_with_duplicate_titles = set()
id_title2parsed_obj = {}

# First pass over the input: count its lines so the parsing pass can report
# progress as a percentage of the total.
with io.open(input_file_path, mode="rt", encoding="utf-8", errors="ignore") as f:
    num_lines = sum(1 for _ in f)

# Running count of lines consumed by the parsing pass.
c = 0

with io.open(input_file_path, mode="rt", encoding="utf-8", errors="ignore") as f:
    for line in f:
        c += 1

        if c % 1000000 == 0:
            print("Processed: {:.2f}%".format(c * 100 / num_lines))

        if line.startswith("<doc id="):
            doc_xml = ET.fromstring("{}{}".format(line, xml_end_tag))
            doc_attr = doc_xml.attrib
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



blink/candidate_retrieval/process_wiki_extractor_output_full.py [15:61]:
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# Command-line interface: two required string options, registered from a
# small (flag, help text) table.
parser = argparse.ArgumentParser()

_cli_options = (
    ("--input", "The full path to the file to process"),
    ("--output", "The full path to the output file"),
)
for _flag, _help_text in _cli_options:
    parser.add_argument(_flag, type=str, help=_help_text, required=True)

# Parses the process arguments; argparse exits with a usage message when a
# required option is missing.
args = parser.parse_args()


# Paths supplied on the command line.
input_file_path = args.input
output_file_path = args.output

# Bail out when the input is missing. The message names the *input* path;
# the earlier version formatted the output path here, which misreported
# which file was absent.
# NOTE(review): sys.exit() without an argument terminates with status 0, so
# this failure is indistinguishable from success by exit code -- consider a
# non-zero status if nothing depends on the current behavior.
if not os.path.isfile(input_file_path):
    print("Input file `{}` doesn't exist!".format(input_file_path))
    sys.exit()

# Refuse to continue when the output file is already present.
if os.path.isfile(output_file_path):
    print("Output file `{}` already exists!".format(output_file_path))
    sys.exit()

# Closing tag appended to a "<doc id=" header line so that the single line
# can be parsed as a complete XML element.
xml_end_tag = "</doc>"

# Bookkeeping containers, created empty here.
# NOTE(review): judging by the names these hold title -> id, titles seen more
# than once, and (id, title) -> parsed record -- confirm against the parsing
# loop further down.
title2id = {}
entities_with_duplicate_titles = set()
id_title2parsed_obj = {}

# First pass over the input: count its lines so the parsing pass can report
# progress as a percentage of the total.
with io.open(input_file_path, mode="rt", encoding="utf-8", errors="ignore") as f:
    num_lines = sum(1 for _ in f)

# Running count of lines consumed by the parsing pass.
c = 0

with io.open(input_file_path, mode="rt", encoding="utf-8", errors="ignore") as f:
    for line in f:
        c += 1

        if c % 1000000 == 0:
            print("Processed: {:.2f}%".format(c * 100 / num_lines))

        if line.startswith("<doc id="):
            doc_xml = ET.fromstring("{}{}".format(line, xml_end_tag))
            doc_attr = doc_xml.attrib
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -



