bool IsValidUTF8()

in language-extensions/R/common/src/Unicode.cpp [882:941]


// Checks whether the first numberOfBytesInStr bytes of str are well-formed UTF-8.
//
// Parameters:
//  str                - buffer holding the bytes to inspect.
//  numberOfBytesInStr - how many leading bytes of str to validate. Values that are
//                       zero or negative validate nothing; values larger than
//                       str.size() are clamped to str.size().
//
// Returns:
//  true when every inspected byte belongs to a complete, well-formed UTF-8 sequence
//  (an empty range is considered valid); false otherwise. Besides a bad lead byte,
//  a bad continuation byte, or a sequence cut off by the end of the range, this also
//  rejects overlong encodings, UTF-16 surrogate code points (U+D800..U+DFFF) and
//  code points above U+10FFFF.
//
bool IsValidUTF8(const std::string &str, int numberOfBytesInStr)
{
	typedef std::string::size_type SizeType;

	// Never inspect more bytes than the string actually holds, so that an
	// oversized count cannot cause a read past the end of the buffer.
	//
	SizeType length = 0;
	if (numberOfBytesInStr > 0)
	{
		length = static_cast<SizeType>(numberOfBytesInStr);
		if (length > str.size())
		{
			length = str.size();
		}
	}

	SizeType i = 0;
	while (i < length)
	{
		const unsigned char lead = static_cast<unsigned char>(str[i]);
		++i;

		// 0xxxxxxx -> One byte, nothing more to validate.
		//
		if (lead <= 0x7F)
		{
			continue;
		}

		// The first continuation byte has a narrower allowed range for some lead
		// bytes; every later continuation byte must simply match 10xxxxxx.
		//
		SizeType continuationBytes = 0;
		unsigned char secondMin = 0x80;
		unsigned char secondMax = 0xBF;

		if (lead >= 0xC2 && lead <= 0xDF)
		{
			// 110xxxxx -> Two bytes.
			// 0xC0 and 0xC1 are excluded: they could only encode values below
			// U+0080, which must use the one byte form (overlong encoding).
			//
			continuationBytes = 1;
		}
		else if (lead >= 0xE0 && lead <= 0xEF)
		{
			// 1110xxxx -> Three bytes
			//
			continuationBytes = 2;
			if (lead == 0xE0)
			{
				// Below 0xA0 would encode a value under U+0800 (overlong).
				//
				secondMin = 0xA0;
			}
			else if (lead == 0xED)
			{
				// Above 0x9F would encode a surrogate U+D800..U+DFFF.
				//
				secondMax = 0x9F;
			}
		}
		else if (lead >= 0xF0 && lead <= 0xF4)
		{
			// 11110xxx -> Four bytes
			// Lead bytes 0xF5..0xF7 are excluded: they start above U+10FFFF.
			//
			continuationBytes = 3;
			if (lead == 0xF0)
			{
				// Below 0x90 would encode a value under U+10000 (overlong).
				//
				secondMin = 0x90;
			}
			else if (lead == 0xF4)
			{
				// Above 0x8F would encode a value over U+10FFFF.
				//
				secondMax = 0x8F;
			}
		}
		else
		{
			// Stray continuation byte (10xxxxxx) or a byte that can never
			// appear in UTF-8.
			//
			return false;
		}

		// Expect to have continuationBytes more bytes, but the range ended too soon.
		//
		if (length - i < continuationBytes)
		{
			return false;
		}

		const unsigned char second = static_cast<unsigned char>(str[i]);
		++i;
		if (second < secondMin || second > secondMax)
		{
			return false;
		}

		// Remaining continuation bytes should match 10xxxxxx
		//
		for (SizeType j = 1; j < continuationBytes; ++j, ++i)
		{
			if ((static_cast<unsigned char>(str[i]) & 0xC0) != 0x80)
			{
				return false;
			}
		}
	}

	return true;
}