void JSLexer::scanTemplateLiteral()

in lib/Parser/JSLexer.cpp [1884:2137]
184 lines of code
52 McCabe index (conditional complexity)

void JSLexer::scanTemplateLiteral() {
  assert(*curCharPtr_ == '`' || *curCharPtr_ == '}');

  // Whether the token will result in TemplateHead upon encountering ${.
  // If we end the literal with `, then the result is NoSubstitutionTemplate,
  // so this will be ignored.
  bool isHead = *curCharPtr_ == '`';

  // If the token ended with a ` then it's a tail (or NoSubstitutionTemplate),
  // and if it ended with a ${ then it's not a tail.
  bool isTail = false;

  // Advance past the initial `.
  ++curCharPtr_;

  // Track whether we encounter any NotEscapeSequence instances,
  // which will be used to error out on non-tagged sequences.
  bool foundNotEscapeSequence = false;

  // Store the Template Value (TV) in the tmpStorage_.
  tmpStorage_.clear();

  // Store the Template Raw Value (TRV) in the rawStorage_.
  rawStorage_.clear();

  /// Return the Template Raw Value (TRV) of character \p c.
  /// The only time the TRV is different from c is when c is a <CR>.
  /// In that case, this function will return 0x0a (LINE FEED).
  const auto trv = [](char c) -> char {
    if (c == '\r') {
      // This case takes \r and \r\n into account.
      // The code below which consumes line separators will skip the following
      // \n if there is a \r\n.
      // For the purposes of finding the TRV it doesn't matter.
      return 0x0a;
    }
    return c;
  };

  for (;;) {
    if (*curCharPtr_ == '`') {
      isTail = true;
      ++curCharPtr_;
      break;
    } else if (*curCharPtr_ == '$' && curCharPtr_[1] == '{') {
      // End of the TemplateCharacters.
      isTail = false;
      curCharPtr_ += 2;
      break;
    } else if (*curCharPtr_ == '\\') {
      rawStorage_.push_back(*curCharPtr_);
      ++curCharPtr_;
      rawStorage_.push_back(trv(*curCharPtr_));
      switch ((unsigned char)*curCharPtr_) {
        case '\'':
        case '"':
        case '\\':
          tmpStorage_.push_back((unsigned char)*curCharPtr_++);
          break;

        case 'b':
          ++curCharPtr_;
          tmpStorage_.push_back(8);
          break;
        case 'f':
          ++curCharPtr_;
          tmpStorage_.push_back(12);
          break;
        case 'n':
          ++curCharPtr_;
          tmpStorage_.push_back(10);
          break;
        case 'r':
          ++curCharPtr_;
          tmpStorage_.push_back(13);
          break;
        case 't':
          ++curCharPtr_;
          tmpStorage_.push_back(9);
          break;
        case 'v':
          ++curCharPtr_;
          tmpStorage_.push_back(11);
          break;

        case '\0': // EOF?
          if (curCharPtr_ == bufferEnd_) { // eof?
            error(
                SMLoc::getFromPointer(curCharPtr_),
                "non-terminated template literal");
            sm_.note(token_.getStartLoc(), "template literal started here");
            goto breakLoop;
          } else {
            tmpStorage_.push_back((unsigned char)*curCharPtr_++);
          }
          break;

        case '0':
          // '\0' is only a valid escape sequence if not followed by a
          // DecimalDigit.
          if (!(curCharPtr_[1] >= '0' && curCharPtr_[1] <= '9')) {
            ++curCharPtr_;
            appendUnicodeToStorage(0);
            break;
          }
          // fall-through

        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
          // NotEscapeSequence :: DecimalDigit but not 0
          // NotEscapeSequence :: 0 DecimalDigit
          // Octal numbers are not supported in template strings,
          // so leave the number in the raw storage (done above) and move on.
          ++curCharPtr_;
          foundNotEscapeSequence = true;
          break;

        case 'x': {
          ++curCharPtr_;
          const char *start = curCharPtr_;
          auto v = consumeHex(2, false);
          if (!v) {
            foundNotEscapeSequence = true;
          }
          appendUnicodeToStorage(v ? *v : 0);
          rawStorage_.append({start, (size_t)(curCharPtr_ - start)});
          break;
        }

        case 'u': {
          // Pointer to the first character after the 'u', which is where we
          // can continue scanning from if we fail to decode an escape.
          const char *start = curCharPtr_ + 1;
          // Reset the pointer to the '\' to scan the unicode escape.
          --curCharPtr_;
          assert(*curCharPtr_ == '\\' && "must have started with \\");
          auto codepoint = consumeUnicodeEscapeOptional();
          if (!codepoint) {
            foundNotEscapeSequence = true;
            curCharPtr_ = start;
            break;
          }
          appendUnicodeToStorage(*codepoint);
          rawStorage_.append({start, (size_t)(curCharPtr_ - start)});
          break;
        }

        // Escaped line terminator. We just need to skip it, because it was
        // added to the raw storage at the start of the switch statement.
        case '\n':
          ++curCharPtr_;
          break;
        case '\r':
          ++curCharPtr_;
          if (*curCharPtr_ == '\n') // skip CR LF
            ++curCharPtr_;
          break;
        case UTF8_LINE_TERMINATOR_CHAR0: {
          bool isLineTerminator =
              matchUnicodeLineTerminatorOffset1(curCharPtr_);
          uint32_t codepoint = _decodeUTF8SlowPath(curCharPtr_);
          // Needs to be added to the rawStorage_ regardless,
          // but we first need to pop off the byte that was added prior to the
          // switch statement.
          rawStorage_.pop_back();
          appendUnicodeToStorage(codepoint, rawStorage_);
          if (!isLineTerminator) {
            // Only add the codepoint to the tmpStorage if it wasn't a line
            // terminator.
            appendUnicodeToStorage(codepoint);
          }
          break;
        }

        default:
          if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_))) {
            uint32_t codepoint = _decodeUTF8SlowPath(curCharPtr_);
            appendUnicodeToStorage(codepoint);
            // Remove the last byte from rawStorage_ and then append the
            // unicode codepoint to it. The already inserted byte will change
            // if this codepoint is in Supplementary Planes.
            rawStorage_.pop_back();
            appendUnicodeToStorage(codepoint, rawStorage_);
          } else {
            // The TV of EscapeSequence is the SV of EscapeSequence.
            tmpStorage_.push_back((unsigned char)*curCharPtr_++);
          }
          break;
      }
    } else if (LLVM_UNLIKELY(*curCharPtr_ == 0 && curCharPtr_ == bufferEnd_)) {
      error(
          SMLoc::getFromPointer(curCharPtr_),
          "non-terminated template literal");
      sm_.note(token_.getStartLoc(), "template literal started here");
      break;
    } else if (*curCharPtr_ == '\r') {
      // The TV of LineTerminatorSequence is the TRV of
      // LineTerminatorSequence. The only time this differs from the same
      // characters as the bytes in the file is when the sequence begins with
      // a <CR>.
      tmpStorage_.push_back(trv(*curCharPtr_));
      rawStorage_.push_back(trv(*curCharPtr_));
      curCharPtr_++;
      if (*curCharPtr_ == '\n') {
        // Skip the <CR> <LF>
        curCharPtr_++;
      }
    } else {
      if (LLVM_UNLIKELY(isUTF8Start(*curCharPtr_))) {
        // Decode and re-encode the character and append it to the string
        // storage
        uint32_t codepoint = _decodeUTF8SlowPath(curCharPtr_);
        appendUnicodeToStorage(codepoint);
        appendUnicodeToStorage(codepoint, rawStorage_);
      } else {
        rawStorage_.push_back(*curCharPtr_);
        tmpStorage_.push_back(*curCharPtr_++);
      }
    }
  }
breakLoop:
  // If the template literal is tagged and contains invalid escapes, then
  // cooked should be null because there is no way to cook it, per the ESTree
  // 2018 spec. The parser will error when encountering an untagged literal
  // with invalid escapes, so we place nullptr here.
  UniqueString *cookedStr =
      foundNotEscapeSequence ? nullptr : getStringLiteral(tmpStorage_.str());
  UniqueString *rawStr = getStringLiteral(rawStorage_.str());
  if (isHead) {
    if (isTail) {
      // ` characters `
      token_.setTemplateLiteral(
          TokenKind::no_substitution_template, cookedStr, rawStr);
    } else {
      // ` characters ${
      token_.setTemplateLiteral(TokenKind::template_head, cookedStr, rawStr);
    }
  } else {
    if (isTail) {
      // } characters `
      token_.setTemplateLiteral(TokenKind::template_tail, cookedStr, rawStr);
    } else {
      // } characters ${
      token_.setTemplateLiteral(TokenKind::template_middle, cookedStr, rawStr);
    }
  }
}