in language-extensions/R/common/src/Unicode.cpp [882:941]
// Checks whether the first numberOfBytesInStr bytes of str form well-formed UTF-8.
//
// Parameters:
//   str                 - the bytes to validate.
//   numberOfBytesInStr  - how many leading bytes of str to examine. Values larger than
//                         str.size() are clamped to str.size(); values <= 0 examine nothing.
//
// Returns:
//   true if every examined byte belongs to a complete, well-formed UTF-8 sequence
//   (an empty range is considered valid), false otherwise. A sequence is rejected when
//   it has an invalid lead byte, is truncated, has a byte that is not of the form
//   10xxxxxx where a continuation byte is required, is an overlong encoding, encodes a
//   UTF-16 surrogate (U+D800..U+DFFF), or encodes a value above U+10FFFF.
//
bool IsValidUTF8(const std::string &str, int numberOfBytesInStr)
{
    // Never read past the end of the string, even if the caller passes a larger count.
    //
    int byteCount = numberOfBytesInStr;
    if (byteCount > 0 &&
        static_cast<std::string::size_type>(byteCount) > str.size())
    {
        // Safe narrowing: str.size() is smaller than a positive int here.
        //
        byteCount = static_cast<int>(str.size());
    }

    int i = 0;
    while (i < byteCount)
    {
        const unsigned int leadByte = static_cast<unsigned char>(str[i]);
        ++i;

        // Number of 10xxxxxx bytes that must follow the lead byte, the payload bits
        // gathered so far, and the smallest code point that legitimately needs a
        // sequence of this length (anything smaller is an overlong encoding).
        //
        int continuationBytes = 0;
        unsigned int codePoint = 0;
        unsigned int minCodePoint = 0;

        if (leadByte <= 0x7F)
        {
            // 0xxxxxxx -> One byte, nothing more to validate.
            //
            continue;
        }
        else if ((leadByte & 0xE0) == 0xC0)
        {
            // 110xxxxx -> Two bytes
            //
            continuationBytes = 1;
            codePoint = leadByte & 0x1F;
            minCodePoint = 0x80;
        }
        else if ((leadByte & 0xF0) == 0xE0)
        {
            // 1110xxxx -> Three bytes
            //
            continuationBytes = 2;
            codePoint = leadByte & 0x0F;
            minCodePoint = 0x800;
        }
        else if ((leadByte & 0xF8) == 0xF0)
        {
            // 11110xxx -> Four bytes
            //
            continuationBytes = 3;
            codePoint = leadByte & 0x07;
            minCodePoint = 0x10000;
        }
        else
        {
            // Stray continuation byte (10xxxxxx) or a lead byte of 11111xxx.
            //
            return false;
        }

        for (int j = 0; j < continuationBytes; ++j)
        {
            // Expected more continuation bytes, but the input ended early.
            //
            if (i >= byteCount)
            {
                return false;
            }

            // Every continuation byte must match 10xxxxxx.
            //
            const unsigned int nextByte = static_cast<unsigned char>(str[i]);
            if ((nextByte & 0xC0) != 0x80)
            {
                return false;
            }

            codePoint = (codePoint << 6) | (nextByte & 0x3F);
            ++i;
        }

        // Overlong encoding: the value would have fit in a shorter sequence.
        //
        if (codePoint < minCodePoint)
        {
            return false;
        }

        // UTF-16 surrogates are not valid Unicode scalar values.
        //
        if (codePoint >= 0xD800 && codePoint <= 0xDFFF)
        {
            return false;
        }

        // Beyond the last Unicode code point.
        //
        if (codePoint > 0x10FFFF)
        {
            return false;
        }
    }

    return true;
}