bool s3fs_wtf8_encode()

in src/string_util.cpp [523:591]


/// Re-encode a NUL-terminated byte string so that the output is well-formed UTF-8.
///
/// Well-formed UTF-8 sequences are copied through unchanged.  Every byte that
/// does not start a well-formed sequence (stray continuation byte, overlong
/// form, encoded surrogate U+D800..U+DFFF, code point above U+10FFFF, or a
/// sequence cut short by the terminator) is replaced by a three byte sequence
/// carrying the value `escape_base + byte`.
///
/// @param s       NUL-terminated input bytes.  Must not be null.
/// @param result  If non-null, the converted text is appended to it (the
///                string is not cleared first).  If null, the input is only
///                checked and nothing is written.
/// @return true if at least one byte had to be escaped, false if the whole
///         input was already well-formed UTF-8.
bool s3fs_wtf8_encode(const char *s, std::string *result)
{
    bool invalid = false;

    for (; *s; s++) {
        const unsigned char c = *s;

        // Length of the well-formed sequence starting at s, or 0 if there is none.
        unsigned int len = 0;

        if (c <= 0x7f) {
            // single byte encoding
            len = 1;
        } else if (c >= 0xc2 && c <= 0xf5) {
            // Candidate start byte.  Reading s[1], s[2], s[3] needs no bounds
            // check: the input is zero terminated, a zero byte is never a
            // continuation byte, and && stops at the first mismatch.
            if ((c & 0xe0) == 0xc0) {
                // two byte encoding: every start byte above c1 is non-overlong
                if ((s[1] & 0xc0) == 0x80) {
                    len = 2;
                }
            } else if ((c & 0xf0) == 0xe0) {
                // three byte encoding
                if ((s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80) {
                    const unsigned code = ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
                    // reject overlong forms and the whole surrogate block
                    // U+D800..U+DFFF, which UTF-8 does not allow
                    if (code >= 0x800 && !(code >= 0xd800 && code <= 0xdfff)) {
                        len = 3;
                    }
                }
            } else if ((c & 0xf8) == 0xf0) {
                // four byte encoding
                if ((s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80) {
                    const unsigned code = ((c & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
                    // reject overlong forms and anything beyond U+10FFFF
                    if (code >= 0x10000 && code <= 0x10ffff) {
                        len = 4;
                    }
                }
            }
        }

        if (len != 0) {
            if (result) {
                result->append(s, len);
            }
            // Skip the continuation bytes even when nothing is written;
            // otherwise a validation-only call (result == null) would revisit
            // them and wrongly report them as invalid.  The loop increment
            // steps over the final byte of the sequence.
            s += len - 1;
            continue;
        }

        // Invalid byte.  Escape it as the three byte encoding of
        // escape_base + c.  (escape_base is defined elsewhere in this file;
        // it is expected to lie in a private use area such as e000 - f8ff.)
        invalid = true;
        if (result) {
            const unsigned escape = escape_base + c;
            *result += static_cast<char>(0xe0 | ((escape >> 12) & 0x0f));
            *result += static_cast<char>(0x80 | ((escape >> 6) & 0x3f));
            *result += static_cast<char>(0x80 | (escape & 0x3f));
        }
    }
    return invalid;
}