pkg/preprocess/bom.go (28 lines of code) (raw):
package preprocess
import (
"bytes"
"golang.org/x/text/encoding/unicode"
)
// bomSequences lists the byte order marks that RemoveBOM strips from the
// start of its input. The names in the trailing comments are the matching
// constants from Python's codecs module.
var bomSequences = [][]byte{
	{0xEF, 0xBB, 0xBF}, // UTF-8: codecs.BOM_UTF8
	{0xFF, 0xFE},       // UTF-16 little endian: codecs.BOM, codecs.BOM_LE, codecs.BOM_UTF16_LE
	{0xFE, 0xFF},       // UTF-16 big endian: codecs.BOM_BE, codecs.BOM_UTF16_BE
}
// RemoveBOM returns b without a leading byte order mark. The input is first
// passed through encodeToUTF8, so UTF-16 content that carries a BOM comes
// back as UTF-8. At most one mark is removed: the first entry of
// bomSequences that prefixes the data. Input with no known mark is returned
// as produced by encodeToUTF8.
func RemoveBOM(b []byte) []byte {
	data := encodeToUTF8(b)
	for i := range bomSequences {
		mark := bomSequences[i]
		if !bytes.HasPrefix(data, mark) {
			continue
		}
		return data[len(mark):]
	}
	return data
}
// encodeToUTF8 converts UTF-16 input that starts with a BOM into UTF-8 and
// returns the result as a new slice. When decoding fails, the provided slice
// is returned as is; this is the path taken by input that does not start
// with a UTF-16 BOM.
func encodeToUTF8(b []byte) []byte {
	// ExpectBOM makes the decoder stop early with unicode.ErrMissingBOM when
	// b does not start with a UTF-16 BOM, so the endianness given here is
	// only a placeholder.
	decoder := unicode.UTF16(unicode.BigEndian, unicode.ExpectBOM).NewDecoder()

	converted, err := decoder.Bytes(b)
	if err != nil {
		// unicode.ErrMissingBOM means the input is taken to be UTF-8
		// already. Any other error is ignored as well, because the
		// conversion is best-effort.
		return b
	}
	return converted
}