in Source/PLCrashSysctl.c [141:223]
size_t plcrash_sysctl_valid_utf8_bytes_max (const uint8_t *s, size_t maxlen) {
    /*
     * Returns the length, in bytes, of the longest prefix of `s` that consists
     * solely of complete, well-formed UTF-8 sequences. Scanning stops at the
     * first NUL byte, after `maxlen` bytes, or at the first ill-formed or
     * truncated sequence, whichever comes first. No byte at or beyond
     * s[maxlen] is ever read. A leading byte order mark (EF BB BF) is counted
     * as valid.
     *
     * For the official specification documenting the multibyte encoding, refer to:
     *   The Unicode Standard, Version 6.2 - Core Specification
     *   Chapter 3, Section 9 - Unicode Encoding Forms
     *
     * UTF-8 uses a variable-width encoding, with each code point corresponding to
     * a 1, 2, 3, or 4 byte sequence.
     *
     * +---------------------+----------+----------+----------+----------+
     * | Code Point Bit Size | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
     * +---------------------+----------+----------+----------+----------+
     * |                   7 | 0xxxxxxx |          |          |          |
     * |                  11 | 110xxxxx | 10xxxxxx |          |          |
     * |                  16 | 1110xxxx | 10xxxxxx | 10xxxxxx |          |
     * |                  21 | 11110xxx | 10xxxxxx | 10xxxxxx | 10xxxxxx |
     * +---------------------+----------+----------+----------+----------+
     *
     * Matching the bit patterns above is not sufficient: overlong encodings,
     * UTF-16 surrogates (U+D800..U+DFFF) and values above U+10FFFF all fit the
     * patterns but are ill-formed. The well-formed byte ranges (Table 3-7 of
     * the specification) are:
     *
     * +----------+----------+----------+----------+
     * | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
     * +----------+----------+----------+----------+
     * | 00..7F   |          |          |          |
     * | C2..DF   | 80..BF   |          |          |
     * | E0       | A0..BF   | 80..BF   |          |
     * | E1..EC   | 80..BF   | 80..BF   |          |
     * | ED       | 80..9F   | 80..BF   |          |
     * | EE..EF   | 80..BF   | 80..BF   |          |
     * | F0       | 90..BF   | 80..BF   | 80..BF   |
     * | F1..F3   | 80..BF   | 80..BF   | 80..BF   |
     * | F4       | 80..8F   | 80..BF   | 80..BF   |
     * +----------+----------+----------+----------+
     *
     * Only the lead byte and the first trailing byte carry extra constraints;
     * every later trailing byte is simply 80..BF.
     */

    /* The current byte position within the string; also the count of validated bytes. */
    size_t len = 0;

    /* Handle (and skip) an initial BOM */
    if (maxlen >= 3 && s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF) {
        len += 3;
    }

    /* Work forwards, validating one complete sequence per iteration. */
    while (len < maxlen && s[len] != '\0') {
        uint8_t lead = s[len];

        /* 1 byte sequence. Code point value range is 0 to 127. */
        if (lead < 0x80) {
            len++;
            continue;
        }

        /* Number of trailing bytes, and the permitted range of the first one. */
        size_t trailing;
        uint8_t first_min = 0x80;
        uint8_t first_max = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF) {
            /* 2 byte sequence, U+0080..U+07FF. C0 and C1 could only encode overlong forms. */
            trailing = 1;
        } else if (lead >= 0xE0 && lead <= 0xEF) {
            /* 3 byte sequence, U+0800..U+D7FF and U+E000..U+FFFF. */
            trailing = 2;
            if (lead == 0xE0) {
                first_min = 0xA0; /* Below A0 would be an overlong encoding */
            } else if (lead == 0xED) {
                first_max = 0x9F; /* Above 9F would encode a UTF-16 surrogate */
            }
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            /* 4 byte sequence, U+10000..U+10FFFF. */
            trailing = 3;
            if (lead == 0xF0) {
                first_min = 0x90; /* Below 90 would be an overlong encoding */
            } else if (lead == 0xF4) {
                first_max = 0x8F; /* Above 8F would exceed U+10FFFF */
            }
        } else {
            /* Stray trailing byte (80..BF) or a byte that can never lead a sequence (C0, C1, F5..FF). */
            return len;
        }

        /* Verify that the trailing bytes fit within maxlen. Since len < maxlen, the
         * subtraction cannot wrap. */
        if (maxlen - (len + 1) < trailing) {
            return len;
        }

        /* Validate the trailing bytes. A NUL terminator falls outside every permitted
         * range, so a sequence cut short by the end of the string is rejected here too. */
        const uint8_t *tail = &s[len + 1];
        if (tail[0] < first_min || tail[0] > first_max) {
            return len;
        }
        for (size_t i = 1; i < trailing; i++) {
            if ((tail[i] & 0xC0) != 0x80) {
                return len;
            }
        }

        /* Fully validated; step over the lead byte and its trailing bytes. */
        len += 1 + trailing;
    }

    return len;
}