void consumeIgnoredTag()

in inference/src/translator/html.cpp [247:317]


// Consumes a complete element (open tag, body, and matching close tag) from
// `scanner` without interpreting its contents.
//
// - Every attribute of the open tag is appended to `tag.attributes` in the form
//   ` attr="value"`.
// - Everything between the end of the open tag and the start of the matching
//   close tag is stored, unparsed, in `tag.data` as a std::string_view built
//   from the pointers returned by scanner.start().
//   NOTE(review): the view presumably points into the scanner's input buffer,
//   so that buffer has to outlive `tag` -- confirm against the caller.
//
// `name` is compared against the lower-cased tag names reported by the
// scanner, so it is presumably expected to be lower-case already. Nested
// elements with the same name are tracked with a depth counter; other tags
// are ignored entirely.
//
// Calls ABORT on a scanner error or when the input ends before the matching
// closing tag is found.
void consumeIgnoredTag(markup::Scanner &scanner, HTML::Tag &tag, std::string const &name) {
  // Only full elements can be consumed this way. With void tags we don't know
  // where to stop scanning. All other types cannot be nested anyway.
  assert(tag.type == HTML::Tag::ELEMENT);

  // TT_TAG_START is already consumed.
  markup::Scanner::TokenType token;

  // Nesting depth of elements called `name`. Zero while we are still reading
  // the attributes of the open tag, one or more once we are in the body.
  size_t inside = 0;

  // Consume the full open tag, i.e. all its attributes
  while (!inside) {
    token = scanner.next();
    switch (token) {
      case markup::Scanner::TT_ERROR:
        ABORT("HTML parse error");
      case markup::Scanner::TT_EOF:
        ABORT("Did not find closing tag </{}>", name);
      case markup::Scanner::TT_ATTRIBUTE:
        tag.attributes += format(" {}=\"{}\"", scanner.attribute(), scanner.value());
        break;
      default:
        // Not an attribute! Must be something inside the body or the closing
        // tag already. Time to jump to the next loop.
        ++inside;
        break;
    }
  }

  // The last token was the first non-attribute token, i.e. the first thing
  // after the open tag. Remember where it starts: that is the beginning of
  // the raw data we want to capture.
  const char *start = scanner.start();

  // Consume the rest of the HTML until (including) the final closing tag. We
  // start with the token that caused the previous loop to fall into the default
  // case.
  while (inside) {
    switch (token) {
      case markup::Scanner::TT_ERROR:
        ABORT("HTML parse error");
      case markup::Scanner::TT_EOF:
        // Pass `name` so the placeholder in the message is actually filled in,
        // matching the equivalent branch in the attribute loop above.
        ABORT("Did not find closing tag </{}>", name);
      case markup::Scanner::TT_TAG_START:
        // Note: Looking specifically for only our own type of tag so we don't
        // have to care about whether other tags we encounter are void tags or
        // not. Does assume the HTML is valid, as no stack is kept.
        if (toLowerCase(scanner.tag()) == name) ++inside;
        break;
      case markup::Scanner::TT_TAG_END:
        if (toLowerCase(scanner.tag()) == name) --inside;
        break;
      default:
        break;
    }

    // Only continue scanning if we're still inside. We could have just read the
    // TT_TAG_END token that ended this element, and we don't want to continue
    // consuming tokens at that point.
    if (inside) token = scanner.next();
  }

  // Only a TAG_END could have stopped the previous loop. We take the start
  // of the final closing tag as the end of our data.
  assert(token == markup::Scanner::TT_TAG_END);
  const char *end = scanner.start();

  // All data between the end of the first open element, and the start of the
  // last close element, we just treat as raw data that will be printed when
  // this tag is eventually printed.
  assert(end >= start);
  tag.data = std::string_view(start, end - start);
}