def split_wet_file()

in tensorflow_datasets/text/c4_utils.py [0:0]


def split_wet_file(wet_file_path, counter_inc_fn=None):
  """Split a WET file into separate pages."""
  logging.info("Splitting file: %s", wet_file_path)
  if not counter_inc_fn:
    counter_inc_fn = get_counter_inc_fn("split-wet-file")
  counter_inc_fn("wet-file")

  with tf.io.gfile.GFile(wet_file_path,
                         "rb") as f, gzip.GzipFile(fileobj=f) as g:
    url = None
    content = None
    content_len = None
    content_type = None
    timestamp = None

    def _maybe_get_page():
      """Generate a (url, {features}) page."""
      if not url and url is not None:
        counter_inc_fn("filtered:no_url")
      if not content and content is not None:
        counter_inc_fn("filtered:no_content")
      if not content_type and content_type is not None:
        counter_inc_fn("filtered:no_content_type")
      if not content_len and content_len is not None:
        counter_inc_fn("filtered:no_content_len")
      if not timestamp and timestamp is not None:
        counter_inc_fn("filtered:no_timestamp")
      if content and url:
        counter_inc_fn("passed")
        return (url, {
            "text": "\n".join(content),
            "content-type": content_type,
            "content-length": content_len,
            "timestamp": timestamp,
            "url": url
        })
      return None

    for line in io.TextIOWrapper(g, encoding="utf-8"):  # pytype: disable=wrong-arg-types
      line = line.strip()
      if not line:
        continue
      if line == _PAGE_DELIMITER:
        page = _maybe_get_page()
        if page:
          yield page
        url = ""
        content = []
        content_len = ""
        content_type = ""
        timestamp = ""

      if line.startswith(_URL_KEY):
        url = line[len(_URL_KEY):].strip()

      if line.startswith(_URL_DATE):
        timestamp = line[len(_URL_DATE):].strip()

      if line.startswith(_CONTENT_TYPE):
        content_type = line[len(_CONTENT_TYPE):].strip()

      if line.startswith(_CONTENT_LEN):
        content_len = line[len(_CONTENT_LEN):].strip()

      if line.startswith(_METADATA_PREFIXES):
        continue

      content.append(line)  # pytype: disable=attribute-error

    page = _maybe_get_page()
    if page:
      yield page