static int utf8canon()

in libs/base/core.cpp [124:183]


// Re-encodes `size` bytes of `data` into a canonical form and returns the
// number of output bytes that form needs.
//
// Well-formed, shortest-form UTF-8 sequences of 1-3 bytes keep their length
// (encoded surrogate halves are deliberately accepted). A 4-byte sequence in
// the range 0x10000..0x10ffff is split into a high and a low surrogate, each
// handed to write3byte(), and counted as 6 bytes. Any byte that does not start
// such a sequence (stray continuation, truncated or overlong form, value out
// of range) is consumed on its own, handed to write2byte(), and counted as
// 2 bytes.
//
// dst may be NULL to only measure: the single-byte store is skipped then.
// NOTE(review): write2byte()/write3byte() are still called with a NULL dst, so
// they are presumably NULL-tolerant and return the advanced pointer - confirm
// against their definitions.
static int utf8canon(char *dst, const char *data, int size) {
    int total = 0;
    int pos = 0;

    while (pos < size) {
        const uint8_t lead = data[pos];
        const int avail = size - pos; // bytes left, counting the lead byte
        uint32_t cp = lead;
        int seqLen = 0; // stays 0 when the sequence is rejected

        if (lead < 0x80) {
            // plain ASCII
            seqLen = 1;
        } else if ((lead & 0xe0) == 0xc0) {
            // 110xxxxx 10xxxxxx
            if (avail > 1 && (data[pos + 1] & 0xc0) == 0x80) {
                cp = ((lead & 0x1f) << 6) | (data[pos + 1] & 0x3f);
                if (cp >= 0x80) // otherwise overlong
                    seqLen = 2;
            }
        } else if ((lead & 0xf0) == 0xe0) {
            // 1110xxxx 10xxxxxx 10xxxxxx
            if (avail > 2 && (data[pos + 1] & 0xc0) == 0x80 && (data[pos + 2] & 0xc0) == 0x80) {
                cp = ((lead & 0x0f) << 12) | ((data[pos + 1] & 0x3f) << 6) |
                     (data[pos + 2] & 0x3f);
                // surrogates (0xd800..0xdfff) are not rejected, since this
                // function generates them itself
                if (cp >= 0x800) // otherwise overlong
                    seqLen = 3;
            }
        } else if ((lead & 0xf8) == 0xf0) {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (avail > 3 && (data[pos + 1] & 0xc0) == 0x80 && (data[pos + 2] & 0xc0) == 0x80 &&
                (data[pos + 3] & 0xc0) == 0x80) {
                cp = ((lead & 0x07) << 18) | ((data[pos + 1] & 0x3f) << 12) |
                     ((data[pos + 2] & 0x3f) << 6) | (data[pos + 3] & 0x3f);
                if (cp >= 0x10000 && cp <= 0x10ffff)
                    seqLen = 4;
            }
        }

        if (seqLen == 0) {
            // Malformed input: skip exactly one byte and emit that raw byte
            // value through the two-byte writer.
            pos += 1;
            total += 2;
            dst = write2byte(dst, lead);
            continue;
        }

        pos += seqLen;

        if (cp >= 0x10000) {
            // outside the BMP: emit a surrogate pair, three bytes per half
            const uint32_t offset = cp - 0x10000;
            total += 6;
            dst = write3byte(dst, 0xd800 + (offset >> 10));
            dst = write3byte(dst, 0xdc00 + (offset & 0x3ff));
        } else if (cp >= 0x800) {
            total += 3;
            dst = write3byte(dst, cp);
        } else if (cp >= 0x80) {
            total += 2;
            dst = write2byte(dst, cp);
        } else {
            total += 1;
            if (dst)
                *dst++ = cp;
        }
    }

    return total;
}