in cc_net/dedup.py [0:0]
def finalize_doc(doc, field, hashes=None):
    """Drop repeated and zero-hashed lines from ``doc[field]``, in place.

    The text stored under ``field`` is split on newlines and walked in
    lockstep with one hash per line. A line is kept only the first time its
    hash is encountered, and never when its hash equals 0.
    NOTE(review): 0 looks like a "line already removed" marker written by an
    earlier stage -- confirm against the code that produces the hashes.

    Fields written on ``doc``:
      * ``field``: the surviving lines joined with newlines.
      * ``nlines`` / ``length``: line and character counts of the new text.
      * ``line_ids``: ids of the surviving lines, taken from the previous
        ``line_ids`` when present, otherwise positional indices.
      * ``original_nlines`` / ``original_length``: recorded only if absent,
        from the incoming ``nlines`` / ``length`` when available, else from
        the current text.

    Args:
        doc: mutable mapping holding the document; modified in place.
        field: key of the text to clean up.
        hashes: one hash per line. When None, they are popped from
            ``doc[field + "_hash"]``.

    Returns:
        A ``(chars_before, chars_after)`` tuple for the text under ``field``.
    """
    text = doc.get(field)
    lines = text.split("\n")
    n_chars = len(text)

    # Remember the pre-filtering size only once, so that calling this again
    # on an already processed doc does not overwrite the first record.
    doc.setdefault("original_nlines", doc.get("nlines", len(lines)))
    doc.setdefault("original_length", doc.get("length", n_chars))

    if hashes is None:
        hashes = doc.pop(field + "_hash")

    # Remove duplicates inside doc
    previous_ids = doc.get("line_ids", range(len(hashes)))
    seen: Set[int] = set()
    kept_ids = []
    kept_lines = []
    for line_id, line, line_hash in zip(previous_ids, lines, hashes):
        if line_hash in seen or line_hash == 0:
            # Already emitted within this doc, or carries the 0 hash.
            continue
        seen.add(line_hash)
        kept_ids.append(line_id)
        kept_lines.append(line)

    cleaned = "\n".join(kept_lines)
    n_chars_kept = len(cleaned)
    doc[field] = cleaned
    doc["nlines"] = len(kept_ids)
    doc["length"] = n_chars_kept
    doc["line_ids"] = kept_ids
    return n_chars, n_chars_kept