in tensorflow_datasets/text/c4_utils.py
def split_wet_file(wet_file_path, counter_inc_fn=None):
  """Split a WET file into separate pages."""
  logging.info("Splitting file: %s", wet_file_path)
  if not counter_inc_fn:
    counter_inc_fn = get_counter_inc_fn("split-wet-file")
  counter_inc_fn("wet-file")
  with tf.io.gfile.GFile(wet_file_path,
                         "rb") as f, gzip.GzipFile(fileobj=f) as g:
    url = None
    content = None
    content_len = None
    content_type = None
    timestamp = None

    def _maybe_get_page():
      """Return a (url, {features}) page, or None if it should be skipped."""
      # Bump a counter for any field that was started but ended up empty.
      if not url and url is not None:
        counter_inc_fn("filtered:no_url")
      if not content and content is not None:
        counter_inc_fn("filtered:no_content")
      if not content_type and content_type is not None:
        counter_inc_fn("filtered:no_content_type")
      if not content_len and content_len is not None:
        counter_inc_fn("filtered:no_content_len")
      if not timestamp and timestamp is not None:
        counter_inc_fn("filtered:no_timestamp")
      # Only pages with both a URL and non-empty content are emitted.
      if content and url:
        counter_inc_fn("passed")
        return (url, {
            "text": "\n".join(content),
            "content-type": content_type,
            "content-length": content_len,
            "timestamp": timestamp,
            "url": url
        })
      return None

    # Read the decompressed records line by line as UTF-8 text.
    for line in io.TextIOWrapper(g, encoding="utf-8"):  # pytype: disable=wrong-arg-types
      line = line.strip()
      if not line:
        continue
      # A page delimiter starts a new record: emit the previous page (if any)
      # and reset the accumulators.
      if line == _PAGE_DELIMITER:
        page = _maybe_get_page()
        if page:
          yield page
        url = ""
        content = []
        content_len = ""
        content_type = ""
        timestamp = ""
      # Capture metadata fields for the current page.
      if line.startswith(_URL_KEY):
        url = line[len(_URL_KEY):].strip()
      if line.startswith(_URL_DATE):
        timestamp = line[len(_URL_DATE):].strip()
      if line.startswith(_CONTENT_TYPE):
        content_type = line[len(_CONTENT_TYPE):].strip()
      if line.startswith(_CONTENT_LEN):
        content_len = line[len(_CONTENT_LEN):].strip()
      # Skip any remaining metadata lines; everything else is page text.
      if line.startswith(_METADATA_PREFIXES):
        continue
      content.append(line)  # pytype: disable=attribute-error

    # Emit the final page in the file.
    page = _maybe_get_page()
    if page:
      yield page
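
# A minimal usage sketch (not part of the original file): split_wet_file is a
# generator, so a caller can iterate over its (url, features) pages directly.
# The path below is hypothetical; the real pipeline maps this function over
# Common Crawl WET file paths.
#
# for url, features in split_wet_file("/tmp/example.warc.wet.gz"):
#   print(url, features["content-length"], len(features["text"]))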