static Utf8DecoderResult isValidUtf8Codepoint()

in runtime/under-codecs-module.cpp [658:758]


// Classifies the UTF-8 sequence that begins at bytes[index].
//
// Returns:
//   k1Byte..k4Byte          a complete, well-formed sequence of that many
//                           bytes starts at `index`.
//   kInvalidStart           the byte at `index` cannot begin a sequence.
//   kInvalidContinuation1   the byte at index + 1 is not acceptable after the
//                           lead byte (not a trail byte, or it would produce
//                           an overlong form, a surrogate, or a value above
//                           10FFFF).
//   kInvalidContinuation2   the byte at index + 2 is not a trail byte.
//   kInvalidContinuation3   the byte at index + 3 is not a trail byte.
//   kUnexpectedEndOfData    the buffer ends before the sequence is complete,
//                           and every byte seen so far is acceptable.
//
// Bytes are examined strictly left to right, so the first problem encountered
// (bad byte or end of buffer) determines the result.
static Utf8DecoderResult isValidUtf8Codepoint(const Byteslike& bytes,
                                              word index) {
  // Number of bytes in the buffer from `index` onward, lead byte included.
  const word available = bytes.length() - index;
  const byte lead = bytes.byteAt(index);

  if (lead <= kMaxASCII) {
    return k1Byte;
  }

  // Lead bytes that can never start a sequence:
  //   \x80-\xBF  continuation bytes
  //   \xC0-\xC1  would only encode overlong forms of 0000-007F
  //   \xF5-\xFF  would only encode values above 10FFFF
  if (lead < 0xC2 || lead >= 0xF5) {
    return kInvalidStart;
  }

  // Every remaining lead byte requires at least one following byte.
  if (available < 2) {
    return kUnexpectedEndOfData;
  }
  const byte second = bytes.byteAt(index + 1);
  if (!UTF8::isTrailByte(second)) {
    return kInvalidContinuation1;
  }

  if (lead < 0xE0) {
    // \xC2\x80-\xDF\xBF -- 0080-07FF
    return k2Byte;
  }

  if (lead < 0xF0) {
    // \xE0\xA0\x80-\xEF\xBF\xBF -- 0800-FFFF
    //
    // \xE0\x80\x80-\xE0\x9F\xBF are overlong encodings of values that fit in
    // fewer bytes.
    const bool is_overlong = lead == 0xE0 && second < 0xA0;
    // \xED\xA0\x80-\xED\xBF\xBF would decode to the surrogates D800-DFFF,
    // which are not valid in UTF-8. See
    // http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf (table 3-7) and
    // http://www.rfc-editor.org/rfc/rfc3629.txt
    const bool is_surrogate = lead == 0xED && second >= 0xA0;
    if (is_overlong || is_surrogate) {
      return kInvalidContinuation1;
    }
    if (available < 3) {
      return kUnexpectedEndOfData;
    }
    if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) {
      return kInvalidContinuation2;
    }
    return k3Byte;
  }

  // \xF0\x90\x80\x80-\xF4\x8F\xBF\xBF -- 10000-10FFFF
  //
  // \xF0\x80\x80\x80-\xF0\x8F\xBF\xBF are overlong encodings of 0000-FFFF.
  const bool is_overlong = lead == 0xF0 && second < 0x90;
  // \xF4\x90\x80\x80 and above would decode to 110000 or more.
  const bool is_out_of_range = lead == 0xF4 && second >= 0x90;
  if (is_overlong || is_out_of_range) {
    return kInvalidContinuation1;
  }
  if (available < 3) {
    return kUnexpectedEndOfData;
  }
  if (!UTF8::isTrailByte(bytes.byteAt(index + 2))) {
    return kInvalidContinuation2;
  }
  if (available < 4) {
    return kUnexpectedEndOfData;
  }
  if (!UTF8::isTrailByte(bytes.byteAt(index + 3))) {
    return kInvalidContinuation3;
  }
  return k4Byte;
}