private Encoding DetectEncoding(out int consumed)

in Amazon.KinesisTap.FileSystem/AbstractLineProcessor.cs [374:429]


        /// <summary>
        /// Looks for a byte-order mark (BOM) at the start of the current buffer window
        /// (<c>_len</c> bytes of <c>_buffer</c> beginning at <c>_startPos</c>) and returns the
        /// encoding that the BOM identifies.
        /// </summary>
        /// <param name="consumed">
        /// Receives the number of leading bytes recognized as a BOM (2, 3 or 4);
        /// 0 when no BOM was matched or when no decision could be made.
        /// </param>
        /// <returns>
        /// The encoding selected from the BOM; UTF-8 when at least three bytes are available and
        /// none of the known BOMs match; <c>null</c> when fewer than two bytes are available, or
        /// when exactly two bytes are available and they match no two-byte BOM.
        /// </returns>
        private Encoding DetectEncoding(out int consumed)
        {
            consumed = 0;

            // A BOM is at least two bytes long; with less than that nothing can be decided.
            if (_len < 2)
            {
                return null;
            }

            var window = new Span<byte>(_buffer, _startPos, _len);
            byte first = window[0];
            byte second = window[1];

            // FE FF => UTF-16, big endian.
            if (first == 0xFE && second == 0xFF)
            {
                consumed = 2;
                return new UnicodeEncoding(true, true);
            }

            // FF FE => UTF-16 little endian, unless it is followed by 00 00,
            // in which case the four bytes form the UTF-32 little endian BOM.
            if (first == 0xFF && second == 0xFE)
            {
                bool followedByTwoZeroBytes = _len >= 4 && window[2] == 0 && window[3] == 0;
                if (!followedByTwoZeroBytes)
                {
                    consumed = 2;
                    return new UnicodeEncoding(false, true);
                }

                // FF FE 00 00 is ambiguous: it is either the UTF-32 LE BOM, or a UTF-16 LE BOM
                // whose first character happens to be U+0000. The bytes alone cannot settle it,
                // so when the configured encoding is a UTF-16 one, that choice wins and only
                // the two-byte BOM is consumed.
                if (CurrentEncoding is UnicodeEncoding)
                {
                    consumed = 2;
                    return CurrentEncoding;
                }

                consumed = 4;
                return new UTF32Encoding(false, true);
            }

            // EF BB BF => UTF-8.
            if (_len >= 3 && first == 0xEF && second == 0xBB && window[2] == 0xBF)
            {
                consumed = 3;
                return Encoding.UTF8;
            }

            // 00 00 FE FF => UTF-32, big endian.
            if (_len >= 4 && first == 0 && second == 0 && window[2] == 0xFE && window[3] == 0xFF)
            {
                consumed = 4;
                return new UTF32Encoding(true, true);
            }

            // With only two bytes the longer BOMs cannot be ruled out yet, so no encoding is
            // reported; otherwise enough bytes were inspected without a BOM and UTF-8 is assumed.
            return _len == 2 ? null : Encoding.UTF8;
        }