in Amazon.KinesisTap.FileSystem/AbstractLineProcessor.cs [374:429]
/// <summary>
/// Examines the leading bytes of the buffered data (starting at <c>_startPos</c>, <c>_len</c> bytes long)
/// for a byte-order mark (BOM) and returns the encoding it indicates.
/// </summary>
/// <param name="consumed">
/// Set to the length in bytes of the BOM that was recognized (2, 3 or 4), or 0 when no BOM was recognized.
/// </param>
/// <returns>
/// The encoding indicated by the BOM; <see cref="Encoding.UTF8"/> when three or more bytes are buffered and
/// no BOM matches; or <c>null</c> when fewer than two bytes are buffered, or exactly two bytes are buffered
/// and they are not a UTF-16 BOM.
/// </returns>
private Encoding DetectEncoding(out int consumed)
{
    consumed = 0;

    // A BOM is at least two bytes long, so nothing can be decided with less.
    if (_len < 2)
    {
        return null;
    }

    var head = new Span<byte>(_buffer, _startPos, _len);
    var first = head[0];
    var second = head[1];

    // FE FF: UTF-16 big endian.
    if (first == 0xFE && second == 0xFF)
    {
        consumed = 2;
        return new UnicodeEncoding(bigEndian: true, byteOrderMark: true);
    }

    // FF FE: UTF-16 little endian, unless followed by 00 00, in which case it may be UTF-32 little endian.
    if (first == 0xFF && second == 0xFE)
    {
        var followedByTwoZeroBytes = _len >= 4 && head[2] == 0 && head[3] == 0;
        if (!followedByTwoZeroBytes)
        {
            consumed = 2;
            return new UnicodeEncoding(bigEndian: false, byteOrderMark: true);
        }

        // FF FE 00 00 is ambiguous: it is the UTF-32 LE BOM, but also what a UTF-16 LE stream looks like
        // when its BOM is immediately followed by 0x00 0x00. The bytes alone cannot settle this, so when
        // the current encoding is already a UTF-16 one we keep it and treat only two bytes as the BOM.
        if (CurrentEncoding is UnicodeEncoding)
        {
            consumed = 2;
            return CurrentEncoding;
        }

        consumed = 4;
        return new UTF32Encoding(bigEndian: false, byteOrderMark: true);
    }

    // EF BB BF: UTF-8.
    var isUtf8Bom = _len >= 3 && first == 0xEF && second == 0xBB && head[2] == 0xBF;
    if (isUtf8Bom)
    {
        consumed = 3;
        return Encoding.UTF8;
    }

    // 00 00 FE FF: UTF-32 big endian.
    var isUtf32BigEndianBom = _len >= 4 && first == 0 && second == 0 && head[2] == 0xFE && head[3] == 0xFF;
    if (isUtf32BigEndianBom)
    {
        consumed = 4;
        return new UTF32Encoding(bigEndian: true, byteOrderMark: true);
    }

    // With only two bytes buffered, the longer BOMs cannot be ruled out yet, so no decision is made.
    if (_len == 2)
    {
        return null;
    }

    // Three or more bytes are buffered and none of the BOMs matched: assume UTF-8 (consumed stays 0).
    return Encoding.UTF8;
}