in cc_net/process_wet_file.py [0:0]
def parse_doc(headers: List[str], doc: List[str]) -> Optional[dict]:
"""Headers format is:
WARC/1.0
WARC-Type: conversion
WARC-Target-URI: [url]
WARC-Date: [crawldate: 2019-02-15T19:15:59Z]
WARC-Record-ID: <urn:uuid:8865156e-d5f1-4734-9c68-4b46eaf2bb7e>
WARC-Refers-To: <urn:uuid:340152e2-65cf-4143-b522-8ce4e2d069d7>
WARC-Block-Digest: sha1:S3DTWCONT2L6ORTGCY2KXEZ37LNBB7V2
Content-Type: text/plain
Content-Length: 7743
"""
if not headers or not doc:
return None
try:
warc_type = headers[1].split()[1]
if warc_type != "conversion":
return None
url = headers[2].split()[1]
date = headers[3].split()[1]
digest = headers[6].split()[1]
length = int(headers[8].split()[1])
except Exception as e:
logger.warning("Can't parse header:", e, headers, doc)
return None
# Docs are separated by two empty lines.
last = None
if not doc[-1] and not doc[-2]:
last = -2
title, doc = doc[0], doc[1:last]
return {
"url": url,
"date_download": date,
"digest": digest,
"length": length,
"nlines": len(doc),
"source_domain": urlparse(url).netloc,
"title": title,
"raw_content": "\n".join(doc),
}