in src/string_util.cpp [523:591]
bool s3fs_wtf8_encode(const char *s, std::string *result)
{
bool invalid = false;
// Pass valid utf8 code through
for (; *s; s++) {
const unsigned char c = *s;
// single byte encoding
if (c <= 0x7f) {
if (result) {
*result += c;
}
continue;
}
// otherwise, it must be one of the valid start bytes
if ( c >= 0xc2 && c <= 0xf5 ) {
// two byte encoding
// don't need bounds check, std::string is zero terminated
if ((c & 0xe0) == 0xc0 && (s[1] & 0xc0) == 0x80) {
// all two byte encodings starting higher than c1 are valid
if (result) {
*result += c;
*result += *(++s);
}
continue;
}
// three byte encoding
if ((c & 0xf0) == 0xe0 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80) {
const unsigned code = ((c & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
if (code >= 0x800 && ! (code >= 0xd800 && code <= 0xd8ff)) {
// not overlong and not a surrogate pair
if (result) {
*result += c;
*result += *(++s);
*result += *(++s);
}
continue;
}
}
// four byte encoding
if ((c & 0xf8) == 0xf0 && (s[1] & 0xc0) == 0x80 && (s[2] & 0xc0) == 0x80 && (s[3] & 0xc0) == 0x80) {
const unsigned code = ((c & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
if (code >= 0x10000 && code <= 0x10ffff) {
// not overlong and in defined unicode space
if (result) {
*result += c;
*result += *(++s);
*result += *(++s);
*result += *(++s);
}
continue;
}
}
}
// printf("invalid %02x at %d\n", c, i);
// Invalid utf8 code. Convert it to a private two byte area of unicode
// e.g. the e000 - f8ff area. This will be a three byte encoding
invalid = true;
if (result) {
unsigned escape = escape_base + c;
*result += static_cast<char>(0xe0 | ((escape >> 12) & 0x0f));
*result += static_cast<char>(0x80 | ((escape >> 06) & 0x3f));
*result += static_cast<char>(0x80 | ((escape >> 00) & 0x3f));
}
}
return invalid;
}