size_t plcrash_sysctl_valid_utf8_bytes_max()

in Source/PLCrashSysctl.c [141:223]


/**
 * Return the length, in bytes, of the longest leading run of @a s that consists solely of
 * complete, well-formed UTF-8 sequences.
 *
 * Scanning stops at the first NUL byte, at @a maxlen, or at the first byte that does not begin
 * a well-formed sequence fitting entirely within @a maxlen -- whichever comes first. A leading
 * UTF-8 byte order mark (EF BB BF) is counted as valid.
 *
 * @param s The bytes to validate. At least @a maxlen bytes must be readable unless a NUL byte
 * occurs earlier; no byte following a NUL is ever read.
 * @param maxlen The maximum number of bytes of @a s to examine.
 *
 * @return The number of leading bytes of @a s (at most @a maxlen, and excluding any NUL
 * terminator) that form well-formed UTF-8.
 */
size_t plcrash_sysctl_valid_utf8_bytes_max (const uint8_t *s, size_t maxlen) {
    /*
     * For the official specification documenting the multibyte encoding, refer to:
     *      The Unicode Standard - Core Specification
     *          Chapter 3, Section 9 - Unicode Encoding Forms
     *          Table 3-7 - Well-Formed UTF-8 Byte Sequences
     *
     * UTF-8 uses a variable-width encoding, with each code point corresponding to
     * a 1, 2, 3, or 4 byte sequence.
     *
     * +--------------------+----------+----------+----------+----------+
     * | Code Points        | 1st Byte | 2nd Byte | 3rd Byte | 4th Byte |
     * +--------------------+----------+----------+----------+----------+
     * | U+0000..U+007F     | 00..7F   |          |          |          |
     * | U+0080..U+07FF     | C2..DF   | 80..BF   |          |          |
     * | U+0800..U+0FFF     | E0       | A0..BF   | 80..BF   |          |
     * | U+1000..U+CFFF     | E1..EC   | 80..BF   | 80..BF   |          |
     * | U+D000..U+D7FF     | ED       | 80..9F   | 80..BF   |          |
     * | U+E000..U+FFFF     | EE..EF   | 80..BF   | 80..BF   |          |
     * | U+10000..U+3FFFF   | F0       | 90..BF   | 80..BF   | 80..BF   |
     * | U+40000..U+FFFFF   | F1..F3   | 80..BF   | 80..BF   | 80..BF   |
     * | U+100000..U+10FFFF | F4       | 80..8F   | 80..BF   | 80..BF   |
     * +--------------------+----------+----------+----------+----------+
     *
     * Checking only the 110xxxxx/1110xxxx/11110xxx/10xxxxxx bit patterns is not sufficient: that
     * would also accept overlong encodings (C0, C1, E0 80..9F, F0 80..8F), UTF-16 surrogates
     * (ED A0..BF) and values beyond U+10FFFF (F4 90..BF, F5..F7). The restricted lead byte and
     * 2nd byte ranges above exclude all of those.
     */

    /* The current string byte position; everything before it has been validated. */
    size_t len = 0;

    /* Handle (and skip) an initial BOM */
    if (maxlen >= 3 && s[0] == 0xEF && s[1] == 0xBB && s[2] == 0xBF) {
        len = 3;
    }

    /* Work forwards, validating one complete sequence per iteration. */
    while (len < maxlen && s[len] != '\0') {
        const uint8_t lead = s[len];

        /* 1 byte sequence. Code point value range is 0 to 127. */
        if (lead < 0x80) {
            len++;
            continue;
        }

        /* Number of trailing bytes, and the permitted range of the first trailing byte. */
        size_t trailing;
        uint8_t first_min = 0x80;
        uint8_t first_max = 0xBF;

        if (lead >= 0xC2 && lead <= 0xDF) {
            /* 2 byte sequence. C0 and C1 could only start overlong encodings of 0 to 127. */
            trailing = 1;

        } else if ((lead & 0xF0) == 0xE0) {
            /* 3 byte sequence. */
            trailing = 2;
            if (lead == 0xE0) {
                /* Anything below A0 would be an overlong encoding of a value under 2048. */
                first_min = 0xA0;
            } else if (lead == 0xED) {
                /* Anything above 9F would encode a UTF-16 surrogate (55296 to 57343). */
                first_max = 0x9F;
            }

        } else if (lead >= 0xF0 && lead <= 0xF4) {
            /* 4 byte sequence. F5 and above could only encode values beyond 1114111. */
            trailing = 3;
            if (lead == 0xF0) {
                /* Anything below 90 would be an overlong encoding of a value under 65536. */
                first_min = 0x90;
            } else if (lead == 0xF4) {
                /* Anything above 8F would encode a value beyond 1114111. */
                first_max = 0x8F;
            }

        } else {
            /* A stray trailing byte (80..BF) or a lead byte that can never be well-formed. */
            return len;
        }

        /* Verify that the whole sequence fits within maxlen. len < maxlen holds here, so the
         * subtraction cannot wrap. */
        if (maxlen - (len + 1) < trailing) {
            return len;
        }

        /* Validate the trailing bytes strictly in order, returning at the first bad one. A NUL
         * terminator falls outside every permitted range, so it is rejected here too and no byte
         * after it is ever read. */
        const uint8_t first = s[len + 1];
        if (first < first_min || first > first_max) {
            return len;
        }

        for (size_t i = 2; i <= trailing; i++) {
            if ((s[len + i] & 0xC0) != 0x80) {
                return len;
            }
        }

        /* Fully validated; step over the lead byte and its trailing bytes. */
        len += trailing + 1;
    }

    return len;
}