in subversion/libsvn_subr/utf8proc/utf8proc.c [113:159]
UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_iterate(
const utf8proc_uint8_t *str, utf8proc_ssize_t strlen, utf8proc_int32_t *dst
) {
utf8proc_uint32_t uc;
const utf8proc_uint8_t *end;
*dst = -1;
if (!strlen) return 0;
end = str + ((strlen < 0) ? 4 : strlen);
uc = *str++;
if (uc < 0x80) {
*dst = uc;
return 1;
}
/* Must be between 0xc2 and 0xf4 inclusive to be valid */
if ((uc - 0xc2) > (0xf4-0xc2)) return UTF8PROC_ERROR_INVALIDUTF8;
if (uc < 0xe0) { /* 2-byte sequence */
/* Must have valid continuation character */
if (str >= end || !utf_cont(*str)) return UTF8PROC_ERROR_INVALIDUTF8;
*dst = ((uc & 0x1f)<<6) | (*str & 0x3f);
return 2;
}
if (uc < 0xf0) { /* 3-byte sequence */
if ((str + 1 >= end) || !utf_cont(*str) || !utf_cont(str[1]))
return UTF8PROC_ERROR_INVALIDUTF8;
/* Check for surrogate chars */
if (uc == 0xed && *str > 0x9f)
return UTF8PROC_ERROR_INVALIDUTF8;
uc = ((uc & 0xf)<<12) | ((*str & 0x3f)<<6) | (str[1] & 0x3f);
if (uc < 0x800)
return UTF8PROC_ERROR_INVALIDUTF8;
*dst = uc;
return 3;
}
/* 4-byte sequence
Must have 3 valid continuation characters */
if ((str + 2 >= end) || !utf_cont(*str) || !utf_cont(str[1]) || !utf_cont(str[2]))
return UTF8PROC_ERROR_INVALIDUTF8;
/* Make sure in correct range (0x10000 - 0x10ffff) */
if (uc == 0xf0) {
if (*str < 0x90) return UTF8PROC_ERROR_INVALIDUTF8;
} else if (uc == 0xf4) {
if (*str > 0x8f) return UTF8PROC_ERROR_INVALIDUTF8;
}
*dst = ((uc & 7)<<18) | ((*str & 0x3f)<<12) | ((str[1] & 0x3f)<<6) | (str[2] & 0x3f);
return 4;
}