def finalize_doc()

in cc_net/dedup.py [0:0]


def finalize_doc(doc, field, hashes=None):
    """Drop repeated and zero-hash lines from ``doc[field]``, in place.

    The text stored under ``field`` is split on ``"\\n"`` and paired
    line-by-line with ``hashes``. A line is kept only when its hash is
    non-zero and has not already appeared earlier in the same document.

    Args:
        doc: Mutable mapping holding the document. It is updated in place:
            ``doc[field]``, ``"nlines"``, ``"length"`` and ``"line_ids"`` are
            overwritten, and ``"original_nlines"`` / ``"original_length"``
            are filled in when they are not present yet.
        field: Key of the text to clean up.
        hashes: One hash per line. When ``None``, the hashes are popped from
            ``doc[field + "_hash"]``.

    Returns:
        A ``(n_chars, n_chars_kept)`` tuple: the length of the text before
        and after the filtering.
    """
    text = doc.get(field)
    raw_lines = text.split("\n")
    n_chars = len(text)

    # Remember the pre-filtering size once; an existing "nlines"/"length"
    # takes precedence over the values measured on the current text.
    for backup_key, current_key, measured in (
        ("original_nlines", "nlines", len(raw_lines)),
        ("original_length", "length", n_chars),
    ):
        if backup_key not in doc:
            doc[backup_key] = doc.get(current_key, measured)

    line_hashes = doc.pop(field + "_hash") if hashes is None else hashes

    # Line ids from a previous pass are reused so that the surviving ids
    # keep referring to the same numbering; otherwise lines are numbered
    # from 0.
    source_ids = doc.get("line_ids", range(len(line_hashes)))

    seen = set()
    kept_ids = []
    kept_lines = []
    for line_id, line, line_hash in zip(source_ids, raw_lines, line_hashes):
        if line_hash in seen:
            # Already encountered in this document: duplicate.
            continue
        seen.add(line_hash)
        # NOTE(review): a hash of 0 presumably marks a line rejected
        # upstream -- confirm with the code producing the hashes.
        if line_hash != 0:
            kept_ids.append(line_id)
            kept_lines.append(line)

    cleaned = "\n".join(kept_lines)
    n_chars_kept = len(cleaned)

    doc[field] = cleaned
    doc["nlines"] = len(kept_ids)
    doc["length"] = n_chars_kept
    doc["line_ids"] = kept_ids
    return n_chars, n_chars_kept