in libs/base/core.cpp [124:183]
// Re-encode the `size` bytes at `data` into a canonical form and return the
// number of output bytes that form occupies.
//
// Each well-formed, shortest-form UTF-8 sequence is decoded and emitted again:
//   - code points below 0x80 as one byte,
//   - below 0x800 through write2byte(),
//   - below 0x10000 through write3byte(),
//   - anything larger as a surrogate pair, i.e. two write3byte() calls (6 bytes).
// A byte that does not start such a sequence (bad lead byte, truncated input,
// wrong continuation byte, overlong form, value above 0x10ffff) is passed on
// its own to write2byte(), counted as 2 bytes, and scanning resumes at the
// very next input byte.
//
// The single-byte path only stores when `dst` is non-NULL, so the function
// looks usable as a pure size query.
// NOTE(review): that relies on write2byte()/write3byte() also accepting a NULL
// dst - confirm against their definitions.
static int utf8canon(char *dst, const char *data, int size) {
    int total = 0;
    int pos = 0;

    while (pos < size) {
        const uint8_t lead = data[pos];

        // Classify the lead byte: sequence length, payload bits carried by the
        // lead byte, and the smallest code point that legitimately needs that
        // length (anything below it is an overlong encoding).
        int need = 0;
        uint32_t cp = lead;
        uint32_t minCp = 0;
        if ((lead & 0x80) == 0x00) {
            need = 1;
        } else if ((lead & 0xe0) == 0xc0) {
            need = 2;
            cp = lead & 0x1f;
            minCp = 0x80;
        } else if ((lead & 0xf0) == 0xe0) {
            need = 3;
            cp = lead & 0x0f;
            minCp = 0x800;
        } else if ((lead & 0xf8) == 0xf0) {
            need = 4;
            cp = lead & 0x07;
            minCp = 0x10000;
        }

        // The whole sequence has to fit in the input before any continuation
        // byte is inspected.
        int ok = need != 0 && pos + (need - 1) < size;

        // Fold in the continuation bytes, stopping at the first one that is
        // not of the form 10xxxxxx.
        for (int k = 1; ok && k < need; k++) {
            if ((data[pos + k] & 0xc0) == 0x80)
                cp = (cp << 6) | (data[pos + k] & 0x3f);
            else
                ok = 0;
        }

        // Reject overlong forms and values past the Unicode range. Encoded
        // surrogates (0xd800-0xdfff) are deliberately let through, because
        // this function produces them itself for code points >= 0x10000.
        if (ok && (cp < minCp || cp > 0x10ffff))
            ok = 0;

        if (!ok) {
            // Malformed input: hand the raw byte value to write2byte() and
            // resynchronise on the following byte.
            total += 2;
            dst = write2byte(dst, lead);
            pos += 1;
            continue;
        }

        pos += need;

        if (cp < 0x80) {
            total += 1;
            if (dst)
                *dst++ = cp;
        } else if (cp < 0x800) {
            total += 2;
            dst = write2byte(dst, cp);
        } else if (cp < 0x10000) {
            total += 3;
            dst = write3byte(dst, cp);
        } else {
            // Split into a high/low surrogate pair, three bytes each.
            const uint32_t offset = cp - 0x10000;
            total += 6;
            dst = write3byte(dst, 0xd800 + (offset >> 10));
            dst = write3byte(dst, 0xdc00 + (offset & 0x3ff));
        }
    }

    return total;
}